Skip to content

Commit 7d96782

Browse files
kzhuravl authored and searlmc1 committed
AMDGPU: Handle llvm.amdgcn.buffer.{load|store}.v2i16 intrinsics
Cherry-pick of patchset #1 of https://reviews.llvm.org/D83249

Needed for 3.7, SWDEV-231101

Change-Id: I04ce9e8353526aa187ab854892f11d3a0f28c03b
1 parent 430e96b commit 7d96782

File tree

2 files changed

+90
-3
lines changed

2 files changed

+90
-3
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

+5-3
Original file line numberDiff line numberDiff line change
@@ -6313,8 +6313,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
63136313
M, DAG, Ops);
63146314

63156315
// Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
6316-
if (LoadVT.getScalarType() == MVT::i8 ||
6317-
LoadVT.getScalarType() == MVT::i16)
6316+
if (!LoadVT.isVector() &&
6317+
(LoadVT.getScalarType() == MVT::i8 ||
6318+
LoadVT.getScalarType() == MVT::i16))
63186319
return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
63196320

63206321
return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
@@ -6957,7 +6958,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
69576958

69586959
// Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
69596960
EVT VDataType = VData.getValueType().getScalarType();
6960-
if (VDataType == MVT::i8 || VDataType == MVT::i16)
6961+
if (!VData.getValueType().isVector() &&
6962+
(VDataType == MVT::i8 || VDataType == MVT::i16))
69616963
return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
69626964

69636965
return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX900 %s
3+
4+
declare <2 x i16> @llvm.amdgcn.buffer.load.v2i16(<4 x i32>, i32, i32, i1 immarg, i1 immarg)
5+
declare void @llvm.amdgcn.buffer.store.v2i16(<2 x i16>, <4 x i32>, i32, i32, i1 immarg, i1 immarg)
6+
7+
define hidden <2 x i16> @buffer_load_v2i16(i16* %0, i32 %1, i32 %2) local_unnamed_addr {
8+
; GFX900-LABEL: buffer_load_v2i16:
9+
; GFX900: buffer_load_v2i16$local:
10+
; GFX900-NEXT: ; %bb.0:
11+
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12+
; GFX900-NEXT: v_mov_b32_e32 v4, -1
13+
; GFX900-NEXT: v_mov_b32_e32 v5, 0x27000
14+
; GFX900-NEXT: v_add_lshl_u32 v3, v3, v2, 1
15+
; GFX900-NEXT: s_mov_b64 s[6:7], exec
16+
; GFX900-NEXT: BB0_1: ; =>This Inner Loop Header: Depth=1
17+
; GFX900-NEXT: v_readfirstlane_b32 s8, v0
18+
; GFX900-NEXT: v_readfirstlane_b32 s9, v1
19+
; GFX900-NEXT: v_readfirstlane_b32 s10, v4
20+
; GFX900-NEXT: v_readfirstlane_b32 s11, v5
21+
; GFX900-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
22+
; GFX900-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[4:5]
23+
; GFX900-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
24+
; GFX900-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
25+
; GFX900-NEXT: s_waitcnt vmcnt(0)
26+
; GFX900-NEXT: buffer_load_dword v2, v3, s[8:11], 0 offen
27+
; GFX900-NEXT: s_xor_b64 exec, exec, s[4:5]
28+
; GFX900-NEXT: s_cbranch_execnz BB0_1
29+
; GFX900-NEXT: ; %bb.2:
30+
; GFX900-NEXT: s_mov_b64 exec, s[6:7]
31+
; GFX900-NEXT: s_waitcnt vmcnt(0)
32+
; GFX900-NEXT: v_mov_b32_e32 v0, v2
33+
; GFX900-NEXT: s_setpc_b64 s[30:31]
34+
%4 = ptrtoint i16* %0 to i64
35+
%5 = bitcast i64 %4 to <2 x i32>
36+
%6 = shufflevector <2 x i32> %5, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
37+
%7 = shufflevector <4 x i32> %6, <4 x i32> <i32 undef, i32 undef, i32 -1, i32 159744>, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
38+
%8 = add i32 %2, %1
39+
%9 = shl i32 %8, 1
40+
%10 = tail call <2 x i16> @llvm.amdgcn.buffer.load.v2i16(<4 x i32> %7, i32 0, i32 %9, i1 zeroext false, i1 zeroext false)
41+
ret <2 x i16> %10
42+
}
43+
44+
define hidden void @buffer_store_v2i16(i16* nocapture readonly %0, i16* %1, i32 %2, i32 %3) local_unnamed_addr {
45+
; GFX900-LABEL: buffer_store_v2i16:
46+
; GFX900: buffer_store_v2i16$local:
47+
; GFX900-NEXT: ; %bb.0:
48+
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
49+
; GFX900-NEXT: flat_load_ushort v1, v[0:1]
50+
; GFX900-NEXT: v_add_lshl_u32 v0, v5, v4, 1
51+
; GFX900-NEXT: v_mov_b32_e32 v6, -1
52+
; GFX900-NEXT: v_mov_b32_e32 v7, 0x27000
53+
; GFX900-NEXT: s_mov_b64 s[6:7], exec
54+
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
55+
; GFX900-NEXT: v_and_b32_e32 v4, 0xffff, v1
56+
; GFX900-NEXT: v_lshl_or_b32 v1, v1, 16, v4
57+
; GFX900-NEXT: BB1_1: ; =>This Inner Loop Header: Depth=1
58+
; GFX900-NEXT: v_readfirstlane_b32 s8, v2
59+
; GFX900-NEXT: v_readfirstlane_b32 s9, v3
60+
; GFX900-NEXT: v_readfirstlane_b32 s10, v6
61+
; GFX900-NEXT: v_readfirstlane_b32 s11, v7
62+
; GFX900-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3]
63+
; GFX900-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[6:7]
64+
; GFX900-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
65+
; GFX900-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
66+
; GFX900-NEXT: s_nop 0
67+
; GFX900-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen
68+
; GFX900-NEXT: s_xor_b64 exec, exec, s[4:5]
69+
; GFX900-NEXT: s_cbranch_execnz BB1_1
70+
; GFX900-NEXT: ; %bb.2:
71+
; GFX900-NEXT: s_mov_b64 exec, s[6:7]
72+
; GFX900-NEXT: s_waitcnt vmcnt(0)
73+
; GFX900-NEXT: s_setpc_b64 s[30:31]
74+
%5 = ptrtoint i16* %1 to i64
75+
%6 = bitcast i64 %5 to <2 x i32>
76+
%7 = shufflevector <2 x i32> %6, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
77+
%8 = shufflevector <4 x i32> %7, <4 x i32> <i32 undef, i32 undef, i32 -1, i32 159744>, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
78+
%9 = load i16, i16* %0, align 2
79+
%10 = insertelement <2 x i16> undef, i16 %9, i32 0
80+
%11 = shufflevector <2 x i16> %10, <2 x i16> undef, <2 x i32> zeroinitializer
81+
%12 = add i32 %3, %2
82+
%13 = shl i32 %12, 1
83+
tail call void @llvm.amdgcn.buffer.store.v2i16(<2 x i16> %11, <4 x i32> %8, i32 0, i32 %13, i1 zeroext false, i1 zeroext false)
84+
ret void
85+
}

0 commit comments

Comments
 (0)