[AMDGPU] Stop reserving $vcc_hi in wave32 mode #87783
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Jay Foad (jayfoad)

Changes

This gives us one extra SGPR to play with. The comment suggested that it could cause bugs, but I have tested it with Vulkan CTS with the default wave size for compute shaders set to 32 and did not find any problems.

Patch is 67.46 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/87783.diff

16 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index fa77b94fc22def..8f0eae362ecae0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -642,6 +642,17 @@ void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
Policy.ShouldTrackLaneMasks = true;
}
+void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const {
+ if (isWave32()) {
+ // Fix implicit $vcc operands after MIParser has verified that they match
+ // the instruction definitions.
+ for (auto &MBB : MF) {
+ for (auto &MI : MBB)
+ InstrInfo.fixImplicitOperands(MI);
+ }
+ }
+}
+
bool GCNSubtarget::hasMadF16() const {
return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 4da10beabe3162..e24a18a2842f62 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -923,6 +923,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
void overrideSchedPolicy(MachineSchedPolicy &Policy,
unsigned NumRegionInstrs) const override;
+ void mirFileLoaded(MachineFunction &MF) const override;
+
unsigned getMaxNumUserSGPRs() const {
return AMDGPU::getMaxNumUserSGPRs(*this);
}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 245731ad5fc7c9..acb54fd10b90dc 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -612,13 +612,6 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
// Reserve null register - it shall never be allocated
reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64);
- // Disallow vcc_hi allocation in wave32. It may be allocated but most likely
- // will result in bugs.
- if (isWave32) {
- Reserved.set(AMDGPU::VCC);
- Reserved.set(AMDGPU::VCC_HI);
- }
-
// Reserve SGPRs.
//
unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index bf4302c156d83d..4c9c34de7194ce 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -38342,12 +38342,11 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
; GFX10-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_and_b32_e32 v8, 1, v8
; GFX10-NEXT: v_and_b32_e32 v10, 1, v10
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: v_and_b32_e32 v5, 1, v5
; GFX10-NEXT: v_and_b32_e32 v7, 1, v7
; GFX10-NEXT: v_and_b32_e32 v9, 1, v9
@@ -38366,7 +38365,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX10-NEXT: v_cmp_eq_u32_e64 s17, 1, v4
; GFX10-NEXT: v_cmp_eq_u32_e64 s18, 1, v2
; GFX10-NEXT: v_cmp_eq_u32_e64 s19, 1, v0
-; GFX10-NEXT: v_writelane_b32 v40, s35, 3
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: v_cmp_eq_u32_e64 s20, 1, v27
; GFX10-NEXT: v_cmp_eq_u32_e64 s21, 1, v25
; GFX10-NEXT: v_cmp_eq_u32_e64 s22, 1, v23
@@ -38377,10 +38376,10 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX10-NEXT: v_cmp_eq_u32_e64 s27, 1, v13
; GFX10-NEXT: v_cmp_eq_u32_e64 s28, 1, v11
; GFX10-NEXT: v_cmp_eq_u32_e64 s29, 1, v7
-; GFX10-NEXT: v_cmp_eq_u32_e64 s30, 1, v3
-; GFX10-NEXT: v_cmp_eq_u32_e64 s31, 1, v1
-; GFX10-NEXT: v_cmp_eq_u32_e64 s34, 1, v5
-; GFX10-NEXT: v_cmp_eq_u32_e64 s35, 1, v9
+; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_hi, 1, v3
+; GFX10-NEXT: v_cmp_eq_u32_e64 s30, 1, v1
+; GFX10-NEXT: v_cmp_eq_u32_e64 s31, 1, v5
+; GFX10-NEXT: v_cmp_eq_u32_e64 s34, 1, v9
; GFX10-NEXT: s_waitcnt vmcnt(32)
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v31
; GFX10-NEXT: s_waitcnt vmcnt(31)
@@ -38460,10 +38459,10 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX10-NEXT: v_cndmask_b32_e64 v6, v29, v39, s27
; GFX10-NEXT: v_cndmask_b32_e64 v5, v28, v26, s28
; GFX10-NEXT: v_cndmask_b32_e64 v20, v51, v20, s29
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v14, v12, s31
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v55, v16, s30
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v53, v18, s34
-; GFX10-NEXT: v_cndmask_b32_e64 v12, v24, v22, s35
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v14, v12, s30
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v55, v16, vcc_hi
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v53, v18, s31
+; GFX10-NEXT: v_cndmask_b32_e64 v12, v24, v22, s34
; GFX10-NEXT: v_cndmask_b32_e64 v16, v4, v3, s4
; GFX10-NEXT: v_perm_b32 v0, v0, v64, 0x5040100
; GFX10-NEXT: v_perm_b32 v1, v1, v54, 0x5040100
@@ -38481,7 +38480,6 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX10-NEXT: v_perm_b32 v13, v66, v13, 0x5040100
; GFX10-NEXT: v_perm_b32 v14, v65, v17, 0x5040100
; GFX10-NEXT: v_perm_b32 v15, v16, v15, 0x5040100
-; GFX10-NEXT: v_readlane_b32 s35, v40, 3
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
index c48231f3851a74..29621a0477418d 100644
--- a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
+++ b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
@@ -586,7 +586,7 @@ name: dpp_reg_sequence_both_combined
tracksRegLiveness: true
body: |
bb.0:
- liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vcc_lo
%0:vreg_64 = COPY $vgpr0_vgpr1
%1:vreg_64 = COPY $vgpr2_vgpr3
@@ -606,12 +606,12 @@ body: |
# GCN: %4:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 1, 1, 1, implicit $exec
# GCN: %5:vreg_64 = REG_SEQUENCE undef %3:vgpr_32, %subreg.sub0, %4, %subreg.sub1
# GCN: %6:vgpr_32 = V_ADD_U32_dpp %8, %1.sub0, %2, 1, 15, 15, 1, implicit $exec
-# GCN: %7:vgpr_32 = V_ADDC_U32_e32 %5.sub1, %2, implicit-def $vcc, implicit $vcc, implicit $exec
+# GCN: %7:vgpr_32 = V_ADDC_U32_e32 %5.sub1, %2, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec
name: dpp_reg_sequence_first_combined
tracksRegLiveness: true
body: |
bb.0:
- liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vcc_lo
%0:vreg_64 = COPY $vgpr0_vgpr1
%1:vreg_64 = COPY $vgpr2_vgpr3
@@ -636,7 +636,7 @@ name: dpp_reg_sequence_second_combined
tracksRegLiveness: true
body: |
bb.0:
- liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vcc_lo
%0:vreg_64 = COPY $vgpr0_vgpr1
%1:vreg_64 = COPY $vgpr2_vgpr3
@@ -656,12 +656,12 @@ body: |
# GCN: %4:vgpr_32 = V_MOV_B32_dpp %0.sub1, %1.sub1, 1, 1, 1, 1, implicit $exec
# GCN: %5:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1
# GCN: %6:vgpr_32 = V_ADD_U32_e32 %5.sub0, %2, implicit $exec
-# GCN: %7:vgpr_32 = V_ADDC_U32_e32 %5.sub1, %2, implicit-def $vcc, implicit $vcc, implicit $exec
+# GCN: %7:vgpr_32 = V_ADDC_U32_e32 %5.sub1, %2, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec
name: dpp_reg_sequence_none_combined
tracksRegLiveness: true
body: |
bb.0:
- liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vcc_lo
%0:vreg_64 = COPY $vgpr0_vgpr1
%1:vreg_64 = COPY $vgpr2_vgpr3
@@ -683,12 +683,12 @@ body: |
# GCN: S_BRANCH %bb.1
# GCN: bb.1:
# GCN: %6:vgpr_32 = V_ADD_U32_e32 %5.sub0, %2, implicit $exec
-# GCN: %7:vgpr_32 = V_ADDC_U32_e32 %5.sub1, %2, implicit-def $vcc, implicit $vcc, implicit $exec
+# GCN: %7:vgpr_32 = V_ADDC_U32_e32 %5.sub1, %2, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec
name: dpp_reg_sequence_exec_changed
tracksRegLiveness: true
body: |
bb.0:
- liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vcc_lo
%0:vreg_64 = COPY $vgpr0_vgpr1
%1:vreg_64 = COPY $vgpr2_vgpr3
@@ -699,6 +699,7 @@ body: |
S_BRANCH %bb.1
bb.1:
+ liveins: $vcc_lo
%6:vgpr_32 = V_ADD_U32_e32 %4.sub0, %5, implicit $exec
%7:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %5, implicit-def $vcc, implicit $vcc, implicit $exec
...
@@ -712,12 +713,12 @@ body: |
# GCN: %5:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1
# GCN: %6:vreg_64 = REG_SEQUENCE %5.sub0, %subreg.sub0, %5.sub1, %subreg.sub1
# GCN: %7:vgpr_32 = V_ADD_U32_e32 %6.sub0, %2, implicit $exec
-# GCN: %8:vgpr_32 = V_ADDC_U32_e32 %6.sub1, %2, implicit-def $vcc, implicit $vcc, implicit $exec
+# GCN: %8:vgpr_32 = V_ADDC_U32_e32 %6.sub1, %2, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec
name: dpp_reg_sequence_subreg
tracksRegLiveness: true
body: |
bb.0:
- liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vcc_lo
%0:vreg_64 = COPY $vgpr0_vgpr1
%1:vreg_64 = COPY $vgpr2_vgpr3
@@ -782,6 +783,7 @@ name: dpp64_add64_impdef
tracksRegLiveness: true
body: |
bb.0:
+ liveins: $vcc_lo
%0:vreg_64 = IMPLICIT_DEF
%1:vreg_64 = IMPLICIT_DEF
%2:vreg_64 = V_MOV_B64_DPP_PSEUDO %1:vreg_64, %0:vreg_64, 1, 15, 15, 1, implicit $exec
@@ -796,6 +798,7 @@ name: dpp64_add64_undef
tracksRegLiveness: true
body: |
bb.0:
+ liveins: $vcc_lo
%2:vreg_64 = V_MOV_B64_DPP_PSEUDO undef %1:vreg_64, undef %0:vreg_64, 1, 15, 15, 1, implicit $exec
%5:vgpr_32 = V_ADD_U32_e32 %2.sub0, undef %4:vgpr_32, implicit $exec
%6:vgpr_32 = V_ADDC_U32_e32 %2.sub1, undef %4, implicit-def $vcc, implicit $vcc, implicit $exec
@@ -860,12 +863,12 @@ body: |
# GCN-LABEL: name: dont_combine_more_than_one_operand_dpp_reg_sequence
# GCN: %5:vgpr_32 = V_ADD_U32_e32 %4.sub0, %4.sub0, implicit $exec
-# GCN: %6:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %4.sub1, implicit-def $vcc, implicit $vcc, implicit $exec
+# GCN: %6:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %4.sub1, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec
name: dont_combine_more_than_one_operand_dpp_reg_sequence
tracksRegLiveness: true
body: |
bb.0:
- liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vcc_lo
%0:vreg_64 = COPY $vgpr0_vgpr1
%1:vreg_64 = COPY $vgpr2_vgpr3
%2:vgpr_32 = V_MOV_B32_dpp %0.sub0, %1.sub0, 1, 15, 15, 1, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
index ec3c08ec795235..da64c379672ef7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
@@ -1259,17 +1259,17 @@ define <4 x i1> @isnan_v4f16(<4 x half> %x) nounwind {
; GFX10SELDAG-LABEL: isnan_v4f16:
; GFX10SELDAG: ; %bb.0:
; GFX10SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10SELDAG-NEXT: v_mov_b32_e32 v2, 3
-; GFX10SELDAG-NEXT: v_cmp_class_f16_e64 s5, v0, 3
-; GFX10SELDAG-NEXT: v_cmp_class_f16_sdwa s4, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
-; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5
-; GFX10SELDAG-NEXT: v_cmp_class_f16_sdwa s5, v0, v2 src0_sel:WORD_1 src1_sel:DWORD
+; GFX10SELDAG-NEXT: v_cmp_class_f16_e64 s4, v0, 3
+; GFX10SELDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4
+; GFX10SELDAG-NEXT: v_cmp_class_f16_e64 s4, v1, 3
+; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX10SELDAG-NEXT: v_cmp_class_f16_sdwa s4, v0, v3 src0_sel:WORD_1 src1_sel:DWORD
+; GFX10SELDAG-NEXT: v_mov_b32_e32 v0, v5
+; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4
+; GFX10SELDAG-NEXT: v_cmp_class_f16_sdwa s4, v1, v3 src0_sel:WORD_1 src1_sel:DWORD
+; GFX10SELDAG-NEXT: v_mov_b32_e32 v1, v4
; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
-; GFX10SELDAG-NEXT: v_mov_b32_e32 v0, v4
-; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, s5
-; GFX10SELDAG-NEXT: v_cmp_class_f16_e64 s5, v1, 3
-; GFX10SELDAG-NEXT: v_mov_b32_e32 v1, v5
-; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5
; GFX10SELDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX10GLISEL-LABEL: isnan_v4f16:
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index ab6a9dcf71acef..a87fa8bf36d9e7 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -7404,35 +7404,35 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
; GFX12-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v9, s31
; GFX12-NEXT: v_dual_mov_b32 v8, s30 :: v_dual_mov_b32 v11, s35
; GFX12-NEXT: v_dual_mov_b32 v10, s34 :: v_dual_mov_b32 v3, s5
-; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x100000
-; GFX12-NEXT: s_lshr_b32 s12, s0, 16
-; GFX12-NEXT: s_mov_b32 s14, s1
-; GFX12-NEXT: s_lshr_b32 s16, s1, 16
-; GFX12-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x100000
; GFX12-NEXT: s_lshr_b32 s2, s2, 16
; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v5, s23
; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v13, s25
+; GFX12-NEXT: s_mov_b32 s12, s1
+; GFX12-NEXT: s_lshr_b32 s14, s1, 16
; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000
; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27
; GFX12-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v7, s7
+; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x100000
+; GFX12-NEXT: s_lshr_b32 s0, s0, 16
; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v17, s19
+; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000
; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000
-; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v16, s18 :: v_dual_mov_b32 v19, s21
; GFX12-NEXT: v_mov_b32_e32 v18, s20
-; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:80
; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:64
-; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
+; GFX12-NEXT: v_dual_mov_b32 v1, s17 :: v_dual_mov_b32 v0, s16
; GFX12-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2
-; GFX12-NEXT: v_dual_mov_b32 v9, s15 :: v_dual_mov_b32 v8, s14
-; GFX12-NEXT: v_dual_mov_b32 v11, s17 :: v_dual_mov_b32 v10, s16
+; GFX12-NEXT: v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v8, s12
+; GFX12-NEXT: v_dual_mov_b32 v11, s15 :: v_dual_mov_b32 v10, s14
; GFX12-NEXT: v_dual_mov_b32 v21, s11 :: v_dual_mov_b32 v20, s10
-; GFX12-NEXT: v_dual_mov_b32 v23, s13 :: v_dual_mov_b32 v22, s12
+; GFX12-NEXT: v_dual_mov_b32 v23, s1 :: v_dual_mov_b32 v22, s0
; GFX12-NEXT: s_clause 0x5
; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:112
; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:96
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index 952827b8cd0e71..889755c23bbc72 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -8808,73 +8808,73 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v2, 8, s6
; GFX12-NEXT: v_lshrrev_b16 v4, 8, s5
; GFX12-NEXT: v_lshrrev_b16 v8, 8, s2
-; GFX12-NEXT: s_lshr_b32 s24, s7, 16
+; GFX12-NEXT: s_lshr_b32 s22, s7, 16
; GFX12-NEXT: v_bfe_i32 v31, v1, 0, 8
-; GFX12-NEXT: s_lshr_b32 s42, s2, 24
-; GFX12-NEXT: s_mov_b32 s48, s7
+; GFX12-NEXT: s_lshr_b32 s40, s2, 24
+; GFX12-NEXT: s_mov_b32 s46, s7
; GFX12-NEXT: v_lshrrev_b16 v5, 8, s4
; GFX12-NEXT: v_lshrrev_b16 v7, 8, s1
-; GFX12-NEXT: s_lshr_b32 s26, s6, 16
-; GFX12-NEXT: s_lshr_b32 s44, s1, 16
+; GFX12-NEXT: s_lshr_b32 s24, s6, 16
+; GFX12-NEXT: s_lshr_b32 s42, s1, 16
; GFX12-NEXT: s_ashr_i64 s[58:59], s[6:7], 56
-; GFX12-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
; GFX12-NEXT: v_lshrrev_b16 v6, 8, s3
; GFX12-NEXT: v_lshrrev_b16 v3, 8, s0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v33, s24
-; GFX12-NEXT: s_lshr_b32 s28, s6, 24
-; GFX12-NEXT: s_lshr_b32 s30, s5, 16
-; GFX12-NEXT: s_lshr_b32 s40, s2, 16
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v33, s22
+; GFX12-NEXT: s_lshr_b32 s26, s6, 24
+; GFX12-NEXT: s_lshr_b32 s28, s5, 16
+; GFX12-NEXT: s_lshr_b32 s38, s2, 16
; GFX12-NEXT: v_bfe_i32 v11, v8, 0, 8
; GFX12-NEXT: v_bfe_i32 v23, v4, 0, 8
; GFX12-NEXT: v_bfe_i32 v27, v2, 0, 8
; GFX12-NEXT: v_ashrrev_i32_e32 v32, 31, v31
-; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v34, s25 :: v_dual_mov_b32 v35, s58
-; GFX12-NEXT: v_dual_mov_b32 v36, s59 :: v_dual_mov_b32 v37, s26
-; GFX12-NEXT: v_dual_mov_b32 v56, s43 :: v_dual_mov_b32 v29, s48
-; GFX12-NEXT: v_mov_b32_e32 v30, s49
-; GFX12-NEXT: s_lshr_b32 s46, s0, 24
-; GFX12-NEXT: s_mov_b32 s50, s5
-; GFX12-NEXT: s_mov_b32 s52, s3
-; GFX12-NEXT: s_lshr_b32 s34, s4, 16
-; GFX12-NEXT: s_lshr_b32 s36, s4, 24
-; GFX12-NEXT: s_ashr_i64 s[22:23], s[2:3], 56
+; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v34, s23 :: v_dual_mov_b32 v35, s58
+; GFX12-NEXT: v_dual_mov_b32 v36, s59 :: v_dual_mov_b32 v37, s24
+; GFX12-NEXT: v_dual_mov_b32 v56, s41 :: v_dual_mov_b32 v29, s46
+; GFX12-NEXT: v_mov_b32_e32 v30, s47
+; GFX12-NEXT: s_lshr_b32 s44, s0, 24
+; GFX12-NEXT: s_mov_b32 s48, s5
+; GFX12-NEXT: s_mov_b32 s50, s3
+; GFX12-NEXT: s_lshr_b32 s30, s4, 16
+; GFX12-NEXT: s_lshr_b32 s34, s4, 24
+; GFX12-NEXT: s_ashr_i64 s[54:55], s[2:3], 56
; GFX12-NEXT: s_ashr_i64 s[56:57], s[4:5], 56
; GFX12-NEXT: v_bfe_i32 v7, v7, 0, 8
; GFX12-NEXT: v_bfe_i32 v19, v5, 0, 8
-; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000
; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
-; GFX12-NEXT: s_lshr_b32 s38, s3, 16
-; GFX12-NEXT: s_mov_b32 s54, s1
+; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
+; GFX12-NEXT: s_lshr_b32 s36, s3, 16
+; GFX12-NEXT: s_mov_b32 s52, s1
; GFX12-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000
; GFX12-NEXT: s_bfe_i64 s[14:15], s[4:5], 0x80000
; GFX12-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[52:53], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[4:5], s[50:51], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[6:7], s[46:47], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[50:51], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[4:5], s[48:49], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[6:7], s[44:45], 0x80000
; GFX12-NEXT: s_lshr_b32 s20, s0, 16
; GFX12-NEXT: s_ashr_i64 s[18:19], s[0:1], 56
; GFX12-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX12-NEXT: v_bfe_i32 v15, v6, 0, 8
-; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v38, s27 :: v_dual_mov_b32 v39, s28
-; GFX12-NEXT: v_dual_mov_b32 v40, s29 :: v_dual_mov_b32 v41, s30
-; GFX12-NEXT: v_dual_mov_b32 v42, s31 :: v_dual_mov_b32 v43, s56
-; GFX12-NEXT: v_dual_mov_b32 v44, s57 :: v_dual_mov_b32 v45, s34
-; GFX12-NEXT: v_dual_mov_b32 v52, s23 :: v_dual_mov_b32 v53, s40
-; GFX12-NEXT: v_dual_mov_b32 v54, s41 :: v_dual_mov_b32 v55, s42
+; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v38, s25 ::...
[truncated]
The first commit is #87781. Please only review the second commit.
I've also been wondering what happens if you unreserve exec_hi. Are writes discarded? Can you use it as a free SGPR?
15fb186 to 33cb662
This gives us one extra SGPR to play with. The comment suggested that it
could cause bugs, but I have tested it with Vulkan CTS with the default
wave size for compute shaders set to 32 and did not find any problems.
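For illustration, here is a minimal wave32 sketch of the effect, built from instructions in the updated bf16.ll checks above (the comments are added here and are not part of the patch): with vcc_hi no longer reserved, the register allocator can hand it out as an ordinary 32-bit SGPR lane-mask register.

; GFX10, wave32: vcc_hi behaves like any other allocatable SGPR
v_cmp_eq_u32_e64 vcc_hi, 1, v3           ; write the comparison mask into vcc_hi
v_cndmask_b32_e64 v1, v55, v16, vcc_hi   ; read it back as the select condition

In that test this frees the callee-saved s35 (the other masks shift down onto s30/s31/s34), which is why one v_writelane_b32/v_readlane_b32 spill pair disappears from the prologue and epilogue.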