diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index e3a330d45aaa57..21fffba14287ef 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -22,6 +22,7 @@
 #include "SIISelLowering.h"
 #include "SIMachineFunctionInfo.h"
 #include "llvm/Analysis/UniformityAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 5ae0b179d7d0e6..11c4cdd560c2f3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -17,7 +17,6 @@
 #include "GCNSubtarget.h"
 #include "SIMachineFunctionInfo.h"
 #include "SIModeRegisterDefaults.h"
-#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/Target/TargetMachine.h"
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 25df5dabdc6aa1..5f4cca0645b0ef 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3556,23 +3556,6 @@ def : AMDGPUPat <
   (V_BFE_U32_e64 $src, (i32 0), $width)
 >;
 
-def uint5Bits : PatLeaf<(i32 VGPR_32:$width), [{
-  return CurDAG->computeKnownBits(SDValue(N, 0)).countMaxActiveBits() <= 5;
-}]>;
-
-// x << (bitwidth - y) >> (bitwidth - y)
-def : AMDGPUPat <
-  (DivergentBinFrag<srl> (shl_oneuse i32:$src, (sub 32, uint5Bits:$width)),
-                         (sub 32, uint5Bits:$width)),
-  (V_BFE_U32_e64 $src, (i32 0), $width)
->;
-
-def : AMDGPUPat <
-  (DivergentBinFrag<sra> (shl_oneuse i32:$src, (sub 32, uint5Bits:$width)),
-                         (sub 32, uint5Bits:$width)),
-  (V_BFE_I32_e64 $src, (i32 0), $width)
->;
-
 // SHA-256 Ma patterns
 
 // ((x & z) | (y & (x | z))) -> BFI (XOR x, y), z, y
diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
index 18d19673995115..ce54ad5c9a6a82 100644
--- a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
@@ -17,8 +17,9 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
 ; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    v_and_b32_e32 v3, 31, v3
-; SI-NEXT:    v_bfe_u32 v2, v2, 0, v3
+; SI-NEXT:    v_sub_i32_e32 v3, vcc, 32, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v2, v3, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v2, v3, v2
 ; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-NEXT:    s_endpgm
 ;
@@ -37,8 +38,9 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_and_b32_e32 v2, 31, v4
-; VI-NEXT:    v_bfe_u32 v2, v3, 0, v2
+; VI-NEXT:    v_sub_u32_e32 v2, vcc, 32, v4
+; VI-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
+; VI-NEXT:    v_lshrrev_b32_e32 v2, v2, v3
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -47,8 +49,7 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
   %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
   %src = load volatile i32, ptr addrspace(1) %in0.gep
   %width = load volatile i32, ptr addrspace(1) %in0.gep
-  %width5 = and i32 %width, 31
-  %sub = sub i32 32, %width5
+  %sub = sub i32 32, %width
   %shl = shl i32 %src, %sub
   %bfe = lshr i32 %shl, %sub
   store i32 %bfe, ptr addrspace(1) %out.gep
@@ -71,7 +72,6 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_and_b32_e32 v3, 31, v3
 ; SI-NEXT:    v_sub_i32_e32 v3, vcc, 32, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, v3, v2
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, v3, v2
@@ -95,8 +95,7 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_and_b32_e32 v2, 31, v4
-; VI-NEXT:    v_sub_u32_e32 v2, vcc, 32, v2
+; VI-NEXT:    v_sub_u32_e32 v2, vcc, 32, v4
 ; VI-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
 ; VI-NEXT:    v_lshrrev_b32_e32 v2, v2, v3
 ; VI-NEXT:    flat_store_dword v[0:1], v2
@@ -109,8 +108,7 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p
   %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
   %src = load volatile i32, ptr addrspace(1) %in0.gep
   %width = load volatile i32, ptr addrspace(1) %in0.gep
-  %width5 = and i32 %width, 31
-  %sub = sub i32 32, %width5
+  %sub = sub i32 32, %width
   %shl = shl i32 %src, %sub
   %bfe = lshr i32 %shl, %sub
   store i32 %bfe, ptr addrspace(1) %out.gep
@@ -221,8 +219,9 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
 ; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NEXT:    v_and_b32_e32 v3, 31, v3
-; SI-NEXT:    v_bfe_i32 v2, v2, 0, v3
+; SI-NEXT:    v_sub_i32_e32 v3, vcc, 32, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v2, v3, v2
+; SI-NEXT:    v_ashrrev_i32_e32 v2, v3, v2
 ; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-NEXT:    s_endpgm
 ;
@@ -241,8 +240,9 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_and_b32_e32 v2, 31, v4
-; VI-NEXT:    v_bfe_i32 v2, v3, 0, v2
+; VI-NEXT:    v_sub_u32_e32 v2, vcc, 32, v4
+; VI-NEXT:    v_lshlrev_b32_e32 v3, v2, v3
+; VI-NEXT:    v_ashrrev_i32_e32 v2, v2, v3
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -251,8 +251,7 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
   %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
   %src = load volatile i32, ptr addrspace(1) %in0.gep
   %width = load volatile i32, ptr addrspace(1) %in0.gep
-  %width5 = and i32 %width, 31
-  %sub = sub i32 32, %width5
+  %sub = sub i32 32, %width
   %shl = shl i32 %src, %sub
   %bfe = ashr i32 %shl, %sub
   store i32 %bfe, ptr addrspace(1) %out.gep
diff --git a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
index 0e5a68773a6ba8..7f1f7133d69919 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
@@ -150,39 +150,47 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
 ; ---------------------------------------------------------------------------- ;
 
 define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind {
-; GCN-LABEL: bzhi32_d0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 31, v1
-; GCN-NEXT:    v_bfe_u32 v0, v0, 0, v1
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %numlow5bits = and i32 %numlowbits, 31
-  %numhighbits = sub i32 32, %numlow5bits
+; SI-LABEL: bzhi32_d0:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_sub_i32_e32 v1, vcc, 32, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bzhi32_d0:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_sub_u32_e32 v1, vcc, 32, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
+; VI-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+  %numhighbits = sub i32 32, %numlowbits
   %highbitscleared = shl i32 %val, %numhighbits
   %masked = lshr i32 %highbitscleared, %numhighbits
   ret i32 %masked
 }
 
-define i32 @bzhi32_d0_even(i32 %val, i32 %numlowbits) nounwind {
-; SI-LABEL: bzhi32_d0_even:
+define i32 @bzhi32_d0_5bits(i32 %val, i32 %numlowbits) nounwind {
+; SI-LABEL: bzhi32_d0_5bits:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
+; SI-NEXT:    v_and_b32_e32 v1, 31, v1
 ; SI-NEXT:    v_sub_i32_e32 v1, vcc, 32, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
 ; SI-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; VI-LABEL: bzhi32_d0_even:
+; VI-LABEL: bzhi32_d0_5bits:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
+; VI-NEXT:    v_and_b32_e32 v1, 31, v1
 ; VI-NEXT:    v_sub_u32_e32 v1, vcc, 32, v1
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
 ; VI-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
-  %times2 = shl i32 %numlowbits, 1
-  %numhighbits = sub i32 32, %times2
+  %numlow5bits = and i32 %numlowbits, 31
+  %numhighbits = sub i32 32, %numlow5bits
   %highbitscleared = shl i32 %val, %numhighbits
   %masked = lshr i32 %highbitscleared, %numhighbits
   ret i32 %masked
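
Note on the deleted patterns: written out as LLVM IR, the idiom they recognized is the shift-based low-bits extract sketched below (the function name is illustrative, not part of this patch). The uint5Bits PatLeaf required the width operand to have at most 5 active bits, i.e. a provable 0-31 range such as the one the 'and' establishes here; under that bound the shl/lshr pair keeps exactly the low %w5 bits, which is what v_bfe_u32 with offset 0 computes. The ashr variant sign-extends the extracted field and mapped to v_bfe_i32 the same way. After this revert, all of the shapes in the tests above lower to the explicit sub/shift sequences instead.

; Sketch only: @ubfe_lowbits_sketch is a hypothetical example, not a test
; from this patch.
define i32 @ubfe_lowbits_sketch(i32 %x, i32 %w) {
  %w5 = and i32 %w, 31        ; bounds the width: countMaxActiveBits() <= 5
  %amt = sub i32 32, %w5      ; 32 - w
  %shl = shl i32 %x, %amt     ; x << (32 - w) clears the high bits...
  %bfe = lshr i32 %shl, %amt  ; ...and >> (32 - w) keeps the low %w5 bits,
  ret i32 %bfe                ; equal to x & ((1 << %w5) - 1) for %w5 > 0
}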