Skip to content

Commit

Permalink
[AMDGPU] Stop reserving $vcc_hi in wave32 mode
Browse files: browse the repository at this point in the history
This gives us one extra SGPR to play with. The comment suggested that it
could cause bugs, but I have tested it with Vulkan CTS with the default
wave size for compute shaders set to 32 and did not find any problems.
  • Loading branch information
jayfoad committed Apr 16, 2024
1 parent d34a2c2 commit 33cb662
Show file tree
Hide file tree
Showing 6 changed files with 88 additions and 99 deletions.
7 changes: 0 additions & 7 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -612,13 +612,6 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
// Reserve null register - it shall never be allocated
reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64);

// Disallow vcc_hi allocation in wave32. It may be allocated but most likely
// will result in bugs.
if (isWave32) {
Reserved.set(AMDGPU::VCC);
Reserved.set(AMDGPU::VCC_HI);
}

// Reserve SGPRs.
//
unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
Expand Down
22 changes: 10 additions & 12 deletions llvm/test/CodeGen/AMDGPU/bf16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -38342,12 +38342,11 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
; GFX10-NEXT: v_and_b32_e32 v6, 1, v6
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_and_b32_e32 v8, 1, v8
; GFX10-NEXT: v_and_b32_e32 v10, 1, v10
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: v_and_b32_e32 v5, 1, v5
; GFX10-NEXT: v_and_b32_e32 v7, 1, v7
; GFX10-NEXT: v_and_b32_e32 v9, 1, v9
Expand All @@ -38366,7 +38365,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX10-NEXT: v_cmp_eq_u32_e64 s17, 1, v4
; GFX10-NEXT: v_cmp_eq_u32_e64 s18, 1, v2
; GFX10-NEXT: v_cmp_eq_u32_e64 s19, 1, v0
; GFX10-NEXT: v_writelane_b32 v40, s35, 3
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: v_cmp_eq_u32_e64 s20, 1, v27
; GFX10-NEXT: v_cmp_eq_u32_e64 s21, 1, v25
; GFX10-NEXT: v_cmp_eq_u32_e64 s22, 1, v23
Expand All @@ -38377,10 +38376,10 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX10-NEXT: v_cmp_eq_u32_e64 s27, 1, v13
; GFX10-NEXT: v_cmp_eq_u32_e64 s28, 1, v11
; GFX10-NEXT: v_cmp_eq_u32_e64 s29, 1, v7
; GFX10-NEXT: v_cmp_eq_u32_e64 s30, 1, v3
; GFX10-NEXT: v_cmp_eq_u32_e64 s31, 1, v1
; GFX10-NEXT: v_cmp_eq_u32_e64 s34, 1, v5
; GFX10-NEXT: v_cmp_eq_u32_e64 s35, 1, v9
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_hi, 1, v3
; GFX10-NEXT: v_cmp_eq_u32_e64 s30, 1, v1
; GFX10-NEXT: v_cmp_eq_u32_e64 s31, 1, v5
; GFX10-NEXT: v_cmp_eq_u32_e64 s34, 1, v9
; GFX10-NEXT: s_waitcnt vmcnt(32)
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v31
; GFX10-NEXT: s_waitcnt vmcnt(31)
Expand Down Expand Up @@ -38460,10 +38459,10 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX10-NEXT: v_cndmask_b32_e64 v6, v29, v39, s27
; GFX10-NEXT: v_cndmask_b32_e64 v5, v28, v26, s28
; GFX10-NEXT: v_cndmask_b32_e64 v20, v51, v20, s29
; GFX10-NEXT: v_cndmask_b32_e64 v0, v14, v12, s31
; GFX10-NEXT: v_cndmask_b32_e64 v1, v55, v16, s30
; GFX10-NEXT: v_cndmask_b32_e64 v2, v53, v18, s34
; GFX10-NEXT: v_cndmask_b32_e64 v12, v24, v22, s35
; GFX10-NEXT: v_cndmask_b32_e64 v0, v14, v12, s30
; GFX10-NEXT: v_cndmask_b32_e64 v1, v55, v16, vcc_hi
; GFX10-NEXT: v_cndmask_b32_e64 v2, v53, v18, s31
; GFX10-NEXT: v_cndmask_b32_e64 v12, v24, v22, s34
; GFX10-NEXT: v_cndmask_b32_e64 v16, v4, v3, s4
; GFX10-NEXT: v_perm_b32 v0, v0, v64, 0x5040100
; GFX10-NEXT: v_perm_b32 v1, v1, v54, 0x5040100
Expand All @@ -38481,7 +38480,6 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX10-NEXT: v_perm_b32 v13, v66, v13, 0x5040100
; GFX10-NEXT: v_perm_b32 v14, v65, v17, 0x5040100
; GFX10-NEXT: v_perm_b32 v15, v16, v15, 0x5040100
; GFX10-NEXT: v_readlane_b32 s35, v40, 3
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
Expand Down
20 changes: 10 additions & 10 deletions llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1259,17 +1259,17 @@ define <4 x i1> @isnan_v4f16(<4 x half> %x) nounwind {
; GFX10SELDAG-LABEL: isnan_v4f16:
; GFX10SELDAG: ; %bb.0:
; GFX10SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10SELDAG-NEXT: v_mov_b32_e32 v2, 3
; GFX10SELDAG-NEXT: v_cmp_class_f16_e64 s5, v0, 3
; GFX10SELDAG-NEXT: v_cmp_class_f16_sdwa s4, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5
; GFX10SELDAG-NEXT: v_cmp_class_f16_sdwa s5, v0, v2 src0_sel:WORD_1 src1_sel:DWORD
; GFX10SELDAG-NEXT: v_cmp_class_f16_e64 s4, v0, 3
; GFX10SELDAG-NEXT: v_mov_b32_e32 v3, 3
; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4
; GFX10SELDAG-NEXT: v_cmp_class_f16_e64 s4, v1, 3
; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
; GFX10SELDAG-NEXT: v_cmp_class_f16_sdwa s4, v0, v3 src0_sel:WORD_1 src1_sel:DWORD
; GFX10SELDAG-NEXT: v_mov_b32_e32 v0, v5
; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4
; GFX10SELDAG-NEXT: v_cmp_class_f16_sdwa s4, v1, v3 src0_sel:WORD_1 src1_sel:DWORD
; GFX10SELDAG-NEXT: v_mov_b32_e32 v1, v4
; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX10SELDAG-NEXT: v_mov_b32_e32 v0, v4
; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, s5
; GFX10SELDAG-NEXT: v_cmp_class_f16_e64 s5, v1, 3
; GFX10SELDAG-NEXT: v_mov_b32_e32 v1, v5
; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5
; GFX10SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX10GLISEL-LABEL: isnan_v4f16:
Expand Down
22 changes: 11 additions & 11 deletions llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -7404,35 +7404,35 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
; GFX12-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v9, s31
; GFX12-NEXT: v_dual_mov_b32 v8, s30 :: v_dual_mov_b32 v11, s35
; GFX12-NEXT: v_dual_mov_b32 v10, s34 :: v_dual_mov_b32 v3, s5
; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x100000
; GFX12-NEXT: s_lshr_b32 s12, s0, 16
; GFX12-NEXT: s_mov_b32 s14, s1
; GFX12-NEXT: s_lshr_b32 s16, s1, 16
; GFX12-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x100000
; GFX12-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x100000
; GFX12-NEXT: s_lshr_b32 s2, s2, 16
; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v5, s23
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v13, s25
; GFX12-NEXT: s_mov_b32 s12, s1
; GFX12-NEXT: s_lshr_b32 s14, s1, 16
; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000
; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
; GFX12-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v7, s7
; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x100000
; GFX12-NEXT: s_lshr_b32 s0, s0, 16
; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v17, s19
; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000
; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000
; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v16, s18 :: v_dual_mov_b32 v19, s21
; GFX12-NEXT: v_mov_b32_e32 v18, s20
; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000
; GFX12-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:80
; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:64
; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX12-NEXT: v_dual_mov_b32 v1, s17 :: v_dual_mov_b32 v0, s16
; GFX12-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
; GFX12-NEXT: v_dual_mov_b32 v9, s15 :: v_dual_mov_b32 v8, s14
; GFX12-NEXT: v_dual_mov_b32 v11, s17 :: v_dual_mov_b32 v10, s16
; GFX12-NEXT: v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v8, s12
; GFX12-NEXT: v_dual_mov_b32 v11, s15 :: v_dual_mov_b32 v10, s14
; GFX12-NEXT: v_dual_mov_b32 v21, s11 :: v_dual_mov_b32 v20, s10
; GFX12-NEXT: v_dual_mov_b32 v23, s13 :: v_dual_mov_b32 v22, s12
; GFX12-NEXT: v_dual_mov_b32 v23, s1 :: v_dual_mov_b32 v22, s0
; GFX12-NEXT: s_clause 0x5
; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:112
; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:96
Expand Down
90 changes: 45 additions & 45 deletions llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8808,90 +8808,90 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v2, 8, s6
; GFX12-NEXT: v_lshrrev_b16 v4, 8, s5
; GFX12-NEXT: v_lshrrev_b16 v8, 8, s2
; GFX12-NEXT: s_lshr_b32 s24, s7, 16
; GFX12-NEXT: s_lshr_b32 s22, s7, 16
; GFX12-NEXT: v_bfe_i32 v31, v1, 0, 8
; GFX12-NEXT: s_lshr_b32 s42, s2, 24
; GFX12-NEXT: s_mov_b32 s48, s7
; GFX12-NEXT: s_lshr_b32 s40, s2, 24
; GFX12-NEXT: s_mov_b32 s46, s7
; GFX12-NEXT: v_lshrrev_b16 v5, 8, s4
; GFX12-NEXT: v_lshrrev_b16 v7, 8, s1
; GFX12-NEXT: s_lshr_b32 s26, s6, 16
; GFX12-NEXT: s_lshr_b32 s44, s1, 16
; GFX12-NEXT: s_lshr_b32 s24, s6, 16
; GFX12-NEXT: s_lshr_b32 s42, s1, 16
; GFX12-NEXT: s_ashr_i64 s[58:59], s[6:7], 56
; GFX12-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000
; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000
; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
; GFX12-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000
; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000
; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
; GFX12-NEXT: v_lshrrev_b16 v6, 8, s3
; GFX12-NEXT: v_lshrrev_b16 v3, 8, s0
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v33, s24
; GFX12-NEXT: s_lshr_b32 s28, s6, 24
; GFX12-NEXT: s_lshr_b32 s30, s5, 16
; GFX12-NEXT: s_lshr_b32 s40, s2, 16
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v33, s22
; GFX12-NEXT: s_lshr_b32 s26, s6, 24
; GFX12-NEXT: s_lshr_b32 s28, s5, 16
; GFX12-NEXT: s_lshr_b32 s38, s2, 16
; GFX12-NEXT: v_bfe_i32 v11, v8, 0, 8
; GFX12-NEXT: v_bfe_i32 v23, v4, 0, 8
; GFX12-NEXT: v_bfe_i32 v27, v2, 0, 8
; GFX12-NEXT: v_ashrrev_i32_e32 v32, 31, v31
; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000
; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
; GFX12-NEXT: v_dual_mov_b32 v34, s25 :: v_dual_mov_b32 v35, s58
; GFX12-NEXT: v_dual_mov_b32 v36, s59 :: v_dual_mov_b32 v37, s26
; GFX12-NEXT: v_dual_mov_b32 v56, s43 :: v_dual_mov_b32 v29, s48
; GFX12-NEXT: v_mov_b32_e32 v30, s49
; GFX12-NEXT: s_lshr_b32 s46, s0, 24
; GFX12-NEXT: s_mov_b32 s50, s5
; GFX12-NEXT: s_mov_b32 s52, s3
; GFX12-NEXT: s_lshr_b32 s34, s4, 16
; GFX12-NEXT: s_lshr_b32 s36, s4, 24
; GFX12-NEXT: s_ashr_i64 s[22:23], s[2:3], 56
; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000
; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
; GFX12-NEXT: v_dual_mov_b32 v34, s23 :: v_dual_mov_b32 v35, s58
; GFX12-NEXT: v_dual_mov_b32 v36, s59 :: v_dual_mov_b32 v37, s24
; GFX12-NEXT: v_dual_mov_b32 v56, s41 :: v_dual_mov_b32 v29, s46
; GFX12-NEXT: v_mov_b32_e32 v30, s47
; GFX12-NEXT: s_lshr_b32 s44, s0, 24
; GFX12-NEXT: s_mov_b32 s48, s5
; GFX12-NEXT: s_mov_b32 s50, s3
; GFX12-NEXT: s_lshr_b32 s30, s4, 16
; GFX12-NEXT: s_lshr_b32 s34, s4, 24
; GFX12-NEXT: s_ashr_i64 s[54:55], s[2:3], 56
; GFX12-NEXT: s_ashr_i64 s[56:57], s[4:5], 56
; GFX12-NEXT: v_bfe_i32 v7, v7, 0, 8
; GFX12-NEXT: v_bfe_i32 v19, v5, 0, 8
; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000
; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000
; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
; GFX12-NEXT: s_lshr_b32 s38, s3, 16
; GFX12-NEXT: s_mov_b32 s54, s1
; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
; GFX12-NEXT: s_lshr_b32 s36, s3, 16
; GFX12-NEXT: s_mov_b32 s52, s1
; GFX12-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000
; GFX12-NEXT: s_bfe_i64 s[14:15], s[4:5], 0x80000
; GFX12-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x80000
; GFX12-NEXT: s_bfe_i64 s[2:3], s[52:53], 0x80000
; GFX12-NEXT: s_bfe_i64 s[4:5], s[50:51], 0x80000
; GFX12-NEXT: s_bfe_i64 s[6:7], s[46:47], 0x80000
; GFX12-NEXT: s_bfe_i64 s[2:3], s[50:51], 0x80000
; GFX12-NEXT: s_bfe_i64 s[4:5], s[48:49], 0x80000
; GFX12-NEXT: s_bfe_i64 s[6:7], s[44:45], 0x80000
; GFX12-NEXT: s_lshr_b32 s20, s0, 16
; GFX12-NEXT: s_ashr_i64 s[18:19], s[0:1], 56
; GFX12-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX12-NEXT: v_bfe_i32 v15, v6, 0, 8
; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000
; GFX12-NEXT: v_dual_mov_b32 v38, s27 :: v_dual_mov_b32 v39, s28
; GFX12-NEXT: v_dual_mov_b32 v40, s29 :: v_dual_mov_b32 v41, s30
; GFX12-NEXT: v_dual_mov_b32 v42, s31 :: v_dual_mov_b32 v43, s56
; GFX12-NEXT: v_dual_mov_b32 v44, s57 :: v_dual_mov_b32 v45, s34
; GFX12-NEXT: v_dual_mov_b32 v52, s23 :: v_dual_mov_b32 v53, s40
; GFX12-NEXT: v_dual_mov_b32 v54, s41 :: v_dual_mov_b32 v55, s42
; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
; GFX12-NEXT: v_dual_mov_b32 v38, s25 :: v_dual_mov_b32 v39, s26
; GFX12-NEXT: v_dual_mov_b32 v40, s27 :: v_dual_mov_b32 v41, s28
; GFX12-NEXT: v_dual_mov_b32 v42, s29 :: v_dual_mov_b32 v43, s56
; GFX12-NEXT: v_dual_mov_b32 v44, s57 :: v_dual_mov_b32 v45, s30
; GFX12-NEXT: v_dual_mov_b32 v52, s55 :: v_dual_mov_b32 v53, s38
; GFX12-NEXT: v_dual_mov_b32 v54, s39 :: v_dual_mov_b32 v55, s40
; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000
; GFX12-NEXT: s_bfe_i64 s[0:1], s[54:55], 0x80000
; GFX12-NEXT: s_bfe_i64 s[0:1], s[52:53], 0x80000
; GFX12-NEXT: v_ashrrev_i32_e32 v12, 31, v11
; GFX12-NEXT: v_ashrrev_i32_e32 v24, 31, v23
; GFX12-NEXT: v_ashrrev_i32_e32 v28, 31, v27
; GFX12-NEXT: global_store_b128 v0, v[33:36], s[8:9] offset:240
; GFX12-NEXT: v_mov_b32_e32 v33, s44
; GFX12-NEXT: v_mov_b32_e32 v33, s42
; GFX12-NEXT: global_store_b128 v0, v[29:32], s[8:9] offset:224
; GFX12-NEXT: v_dual_mov_b32 v25, s16 :: v_dual_mov_b32 v26, s17
; GFX12-NEXT: v_dual_mov_b32 v32, s7 :: v_dual_mov_b32 v21, s4
; GFX12-NEXT: v_dual_mov_b32 v22, s5 :: v_dual_mov_b32 v17, s14
; GFX12-NEXT: v_dual_mov_b32 v14, s3 :: v_dual_mov_b32 v9, s12
; GFX12-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v5, s0
; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000
; GFX12-NEXT: v_dual_mov_b32 v46, s35 :: v_dual_mov_b32 v47, s36
; GFX12-NEXT: v_dual_mov_b32 v48, s37 :: v_dual_mov_b32 v49, s38
; GFX12-NEXT: v_dual_mov_b32 v34, s45 :: v_dual_mov_b32 v35, s18
; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
; GFX12-NEXT: v_dual_mov_b32 v46, s31 :: v_dual_mov_b32 v47, s34
; GFX12-NEXT: v_dual_mov_b32 v48, s35 :: v_dual_mov_b32 v49, s36
; GFX12-NEXT: v_dual_mov_b32 v34, s43 :: v_dual_mov_b32 v35, s18
; GFX12-NEXT: v_dual_mov_b32 v36, s19 :: v_dual_mov_b32 v29, s20
; GFX12-NEXT: v_ashrrev_i32_e32 v8, 31, v7
; GFX12-NEXT: v_ashrrev_i32_e32 v20, 31, v19
; GFX12-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v13, s2
; GFX12-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v1, s10
; GFX12-NEXT: v_dual_mov_b32 v50, s39 :: v_dual_mov_b32 v51, s22
; GFX12-NEXT: v_dual_mov_b32 v50, s37 :: v_dual_mov_b32 v51, s54
; GFX12-NEXT: v_dual_mov_b32 v30, s21 :: v_dual_mov_b32 v31, s6
; GFX12-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX12-NEXT: v_ashrrev_i32_e32 v16, 31, v15
Expand Down
Loading

0 comments on commit 33cb662

Please sign in to comment.