Skip to content

Commit

Permalink
AMDGPU: Don't avoid clamp of bit shift in BFE pattern (#115372)
Browse files Browse the repository at this point in the history
Enable pattern matching from "(x << (32 - y)) >> (32 - y)" to "bfe x, 0, y"
when y is known to be in [0, 31].
This is a follow-up to PR
#114279 and fixes issue
#114282
  • Loading branch information
changpeng authored Nov 7, 2024
1 parent 7475156 commit bdf8e30
Show file tree
Hide file tree
Showing 5 changed files with 43 additions and 33 deletions.
1 change: 0 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIModeRegisterDefaults.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/Target/TargetMachine.h"

Expand Down
17 changes: 17 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -3553,6 +3553,23 @@ def : AMDGPUPat <
(V_BFE_U32_e64 $src, (i32 0), $width)
>;

// Matches a 32-bit VGPR operand whose value is known to fit in 5 bits, i.e.
// is provably in the range [0, 31].
//
// countMaxActiveBits() is the bit width minus the number of known leading
// zero bits, so "<= 5" holds only when bits [31:5] are all known to be zero.
// A trailing-ones count must not be used here: it says nothing about the high
// bits (e.g. the value 32 has no trailing ones but does not fit in 5 bits).
def uint5Bits : PatLeaf<(i32 VGPR_32:$width), [{
  return CurDAG->computeKnownBits(SDValue(N, 0)).countMaxActiveBits() <= 5;
}]>;

// Unsigned low-bits extract written as a shift pair:
//   (x << (32 - width)) >> (32 - width)      (logical right shift)
// The same $width feeds both shift amounts and must satisfy the uint5Bits
// predicate. The match is selected as V_BFE_U32_e64 with offset 0 and field
// width $width.
// NOTE(review): shl_oneuse presumably requires the shl to have no other users,
// and DivergentBinFrag presumably restricts the match to divergent nodes --
// confirm against their definitions.
def : AMDGPUPat <
(DivergentBinFrag<srl> (shl_oneuse i32:$src, (sub 32, uint5Bits:$width)),
(sub 32, uint5Bits:$width)),
(V_BFE_U32_e64 $src, (i32 0), $width)
>;

// Signed low-bits extract written as a shift pair:
//   (x << (32 - width)) >> (32 - width)      (arithmetic right shift)
// The same $width feeds both shift amounts and must satisfy the uint5Bits
// predicate. The match is selected as V_BFE_I32_e64 with offset 0 and field
// width $width.
// NOTE(review): shl_oneuse presumably requires the shl to have no other users,
// and DivergentBinFrag presumably restricts the match to divergent nodes --
// confirm against their definitions.
def : AMDGPUPat <
(DivergentBinFrag<sra> (shl_oneuse i32:$src, (sub 32, uint5Bits:$width)),
(sub 32, uint5Bits:$width)),
(V_BFE_I32_e64 $src, (i32 0), $width)
>;

// SHA-256 Ma patterns

// ((x & z) | (y & (x | z))) -> BFI (XOR x, y), z, y
Expand Down
33 changes: 17 additions & 16 deletions llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,8 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: v_sub_i32_e32 v3, vcc, 32, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, v3, v2
; SI-NEXT: v_lshrrev_b32_e32 v2, v3, v2
; SI-NEXT: v_and_b32_e32 v3, 31, v3
; SI-NEXT: v_bfe_u32 v2, v2, 0, v3
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
Expand All @@ -38,9 +37,8 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
; VI-NEXT: v_lshrrev_b32_e32 v2, v2, v3
; VI-NEXT: v_and_b32_e32 v2, 31, v4
; VI-NEXT: v_bfe_u32 v2, v3, 0, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
Expand All @@ -49,7 +47,8 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
%out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
%src = load volatile i32, ptr addrspace(1) %in0.gep
%width = load volatile i32, ptr addrspace(1) %in0.gep
%sub = sub i32 32, %width
%width5 = and i32 %width, 31
%sub = sub i32 32, %width5
%shl = shl i32 %src, %sub
%bfe = lshr i32 %shl, %sub
store i32 %bfe, ptr addrspace(1) %out.gep
Expand All @@ -72,6 +71,7 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_and_b32_e32 v3, 31, v3
; SI-NEXT: v_sub_i32_e32 v3, vcc, 32, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, v3, v2
; SI-NEXT: v_lshrrev_b32_e32 v3, v3, v2
Expand All @@ -95,7 +95,8 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
; VI-NEXT: v_and_b32_e32 v2, 31, v4
; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v2
; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
; VI-NEXT: v_lshrrev_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
Expand All @@ -108,7 +109,8 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p
%out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
%src = load volatile i32, ptr addrspace(1) %in0.gep
%width = load volatile i32, ptr addrspace(1) %in0.gep
%sub = sub i32 32, %width
%width5 = and i32 %width, 31
%sub = sub i32 32, %width5
%shl = shl i32 %src, %sub
%bfe = lshr i32 %shl, %sub
store i32 %bfe, ptr addrspace(1) %out.gep
Expand Down Expand Up @@ -219,9 +221,8 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: v_sub_i32_e32 v3, vcc, 32, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, v3, v2
; SI-NEXT: v_ashrrev_i32_e32 v2, v3, v2
; SI-NEXT: v_and_b32_e32 v3, 31, v3
; SI-NEXT: v_bfe_i32 v2, v2, 0, v3
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
Expand All @@ -240,9 +241,8 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
; VI-NEXT: v_ashrrev_i32_e32 v2, v2, v3
; VI-NEXT: v_and_b32_e32 v2, 31, v4
; VI-NEXT: v_bfe_i32 v2, v3, 0, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
Expand All @@ -251,7 +251,8 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
%out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
%src = load volatile i32, ptr addrspace(1) %in0.gep
%width = load volatile i32, ptr addrspace(1) %in0.gep
%sub = sub i32 32, %width
%width5 = and i32 %width, 31
%sub = sub i32 32, %width5
%shl = shl i32 %src, %sub
%bfe = ashr i32 %shl, %sub
store i32 %bfe, ptr addrspace(1) %out.gep
Expand Down
24 changes: 8 additions & 16 deletions llvm/test/CodeGen/AMDGPU/extract-lowbits.ll
Original file line number Diff line number Diff line change
Expand Up @@ -150,22 +150,14 @@ define i32 @bzhi32_c4_commutative(i32 %val, i32 %numlowbits) nounwind {
; ---------------------------------------------------------------------------- ;

define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind {
; SI-LABEL: bzhi32_d0:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1
; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bzhi32_d0:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v1
; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
; VI-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; VI-NEXT: s_setpc_b64 s[30:31]
%numhighbits = sub i32 32, %numlowbits
; GCN-LABEL: bzhi32_d0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v1, 31, v1
; GCN-NEXT: v_bfe_u32 v0, v0, 0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
%numlow5bits = and i32 %numlowbits, 31
%numhighbits = sub i32 32, %numlow5bits
%highbitscleared = shl i32 %val, %numhighbits
%masked = lshr i32 %highbitscleared, %numhighbits
ret i32 %masked
Expand Down

0 comments on commit bdf8e30

Please sign in to comment.