AMDGPU: Handle llvm.amdgcn.buffer.{load|store}.v2i16 intrinsics

kzhuravl · searlmc1 · commit 7d9678262ba7 · 2020-07-13T10:48:18.000-04:00
Cherry-pick of patchset #1 of https://reviews.llvm.org/D83249. Needed for 3.7 (SWDEV-231101). Change-Id: I04ce9e8353526aa187ab854892f11d3a0f28c03b
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6313,8 +6313,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
                                  M, DAG, Ops);
 
     // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
-    if (LoadVT.getScalarType() == MVT::i8 ||
-        LoadVT.getScalarType() == MVT::i16)
+    if (!LoadVT.isVector() &&
+        (LoadVT.getScalarType() == MVT::i8 ||
+         LoadVT.getScalarType() == MVT::i16))
       return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
 
     return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
@@ -6957,7 +6958,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 
     // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
     EVT VDataType = VData.getValueType().getScalarType();
-    if (VDataType == MVT::i8 || VDataType == MVT::i16)
+    if (!VData.getValueType().isVector() &&
+        (VDataType == MVT::i8 || VDataType == MVT::i16))
       return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
 
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load-store.v2i16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load-store.v2i16.ll
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX900 %s
+
+declare <2 x i16> @llvm.amdgcn.buffer.load.v2i16(<4 x i32>, i32, i32, i1 immarg, i1 immarg)
+declare void @llvm.amdgcn.buffer.store.v2i16(<2 x i16>, <4 x i32>, i32, i32, i1 immarg, i1 immarg)
+
+define hidden <2 x i16> @buffer_load_v2i16(i16* %0, i32 %1, i32 %2) local_unnamed_addr {
+; GFX900-LABEL: buffer_load_v2i16:
+; GFX900:       buffer_load_v2i16$local:
+; GFX900-NEXT:  ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v4, -1
+; GFX900-NEXT:    v_mov_b32_e32 v5, 0x27000
+; GFX900-NEXT:    v_add_lshl_u32 v3, v3, v2, 1
+; GFX900-NEXT:    s_mov_b64 s[6:7], exec
+; GFX900-NEXT:  BB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX900-NEXT:    v_readfirstlane_b32 s9, v1
+; GFX900-NEXT:    v_readfirstlane_b32 s10, v4
+; GFX900-NEXT:    v_readfirstlane_b32 s11, v5
+; GFX900-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX900-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[4:5]
+; GFX900-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX900-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    buffer_load_dword v2, v3, s[8:11], 0 offen
+; GFX900-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX900-NEXT:    s_cbranch_execnz BB0_1
+; GFX900-NEXT:  ; %bb.2:
+; GFX900-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v0, v2
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+  %4 = ptrtoint i16* %0 to i64
+  %5 = bitcast i64 %4 to <2 x i32>
+  %6 = shufflevector <2 x i32> %5, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %7 = shufflevector <4 x i32> %6, <4 x i32> <i32 undef, i32 undef, i32 -1, i32 159744>, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  %8 = add i32 %2, %1
+  %9 = shl i32 %8, 1
+  %10 = tail call <2 x i16> @llvm.amdgcn.buffer.load.v2i16(<4 x i32> %7, i32 0, i32 %9, i1 zeroext false, i1 zeroext false)
+  ret <2 x i16> %10
+}
+
+define hidden void @buffer_store_v2i16(i16* nocapture readonly %0, i16* %1, i32 %2, i32 %3) local_unnamed_addr {
+; GFX900-LABEL: buffer_store_v2i16:
+; GFX900:       buffer_store_v2i16$local:
+; GFX900-NEXT:  ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    flat_load_ushort v1, v[0:1]
+; GFX900-NEXT:    v_add_lshl_u32 v0, v5, v4, 1
+; GFX900-NEXT:    v_mov_b32_e32 v6, -1
+; GFX900-NEXT:    v_mov_b32_e32 v7, 0x27000
+; GFX900-NEXT:    s_mov_b64 s[6:7], exec
+; GFX900-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_and_b32_e32 v4, 0xffff, v1
+; GFX900-NEXT:    v_lshl_or_b32 v1, v1, 16, v4
+; GFX900-NEXT:  BB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT:    v_readfirstlane_b32 s8, v2
+; GFX900-NEXT:    v_readfirstlane_b32 s9, v3
+; GFX900-NEXT:    v_readfirstlane_b32 s10, v6
+; GFX900-NEXT:    v_readfirstlane_b32 s11, v7
+; GFX900-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3]
+; GFX900-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[6:7]
+; GFX900-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; GFX900-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX900-NEXT:    s_nop 0
+; GFX900-NEXT:    buffer_store_dword v1, v0, s[8:11], 0 offen
+; GFX900-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; GFX900-NEXT:    s_cbranch_execnz BB1_1
+; GFX900-NEXT:  ; %bb.2:
+; GFX900-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+  %5 = ptrtoint i16* %1 to i64
+  %6 = bitcast i64 %5 to <2 x i32>
+  %7 = shufflevector <2 x i32> %6, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %8 = shufflevector <4 x i32> %7, <4 x i32> <i32 undef, i32 undef, i32 -1, i32 159744>, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  %9 = load i16, i16* %0, align 2
+  %10 = insertelement <2 x i16> undef, i16 %9, i32 0
+  %11 = shufflevector <2 x i16> %10, <2 x i16> undef, <2 x i32> zeroinitializer
+  %12 = add i32 %3, %2
+  %13 = shl i32 %12, 1
+  tail call void @llvm.amdgcn.buffer.store.v2i16(<2 x i16> %11, <4 x i32> %8, i32 0, i32 %13, i1 zeroext false, i1 zeroext false)
+  ret void
+}