-
Notifications
You must be signed in to change notification settings - Fork 12.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] In VectorLegalizer::Expand, if UnrollVectorOp returns Load, … #88475
Conversation
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-amdgpu Author: choikwa (choikwa) Changes…return only Load since other output is chain. Added testcase that showed mismatched expected arity when Load and chain were returned as separate items after 003b58f Full diff: https://github.com/llvm/llvm-project/pull/88475.diff 2 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 57a3f6a65e002c..9b268dac484df1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -1159,8 +1159,12 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
}
SDValue Unrolled = DAG.UnrollVectorOp(Node);
- for (unsigned I = 0, E = Unrolled->getNumValues(); I != E; ++I)
- Results.push_back(Unrolled.getValue(I));
+ const LoadSDNode *Ld = dyn_cast<LoadSDNode>(Unrolled.getNode());
+ if (Ld)
+ Results.push_back(Unrolled);
+ else
+ for (unsigned I = 0, E = Unrolled->getNumValues(); I != E; ++I)
+ Results.push_back(Unrolled.getValue(I));
}
SDValue VectorLegalizer::ExpandSELECT(SDNode *Node) {
diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll
index 37412ac3aa5418..5b52e3aa70ab60 100644
--- a/llvm/test/CodeGen/AMDGPU/build_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll
@@ -3,6 +3,7 @@
; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX8,GFX678,ALL
; RUN: llc < %s -mtriple=amdgcn-amd-amdpal -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX10,GFX1011,ALL
; RUN: llc < %s -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11,GFX1011,ALL
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 | FileCheck %s --check-prefixes=GFX940,ALL
; ALL-LABEL: {{^}}build_vector2:
; R600: MOV
@@ -96,3 +97,100 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32
store <2 x i16> %ins.1, ptr addrspace(1) %out
ret void
}
+
+; R600-LABEL: build_v2i32_from_v4i16_shuffle:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 0, @10, KC0[], KC1[]
+; R600-NEXT: TEX 1 @6
+; R600-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: Fetch clause starting at 6:
+; R600-NEXT: VTX_READ_16 T1.X, T0.X, 48, #3
+; R600-NEXT: VTX_READ_16 T0.X, T0.X, 44, #3
+; R600-NEXT: ALU clause starting at 10:
+; R600-NEXT: MOV * T0.X, 0.0,
+; R600-NEXT: ALU clause starting at 11:
+; R600-NEXT: LSHL * T0.Y, T1.X, literal.x,
+; R600-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; R600-NEXT: LSHL T0.X, T0.X, literal.x,
+; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
+; R600-NEXT: 16(2.242078e-44), 2(2.802597e-45)
+;
+; GFX6-LABEL: build_v2i32_from_v4i16_shuffle:
+; GFX6: ; %bb.0: ; %entry
+; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_lshl_b32 s3, s3, 16
+; GFX6-NEXT: s_lshl_b32 s2, s2, 16
+; GFX6-NEXT: s_mov_b32 s6, -1
+; GFX6-NEXT: s_mov_b32 s4, s0
+; GFX6-NEXT: s_mov_b32 s5, s1
+; GFX6-NEXT: v_mov_b32_e32 v0, s2
+; GFX6-NEXT: v_mov_b32_e32 v1, s3
+; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: build_v2i32_from_v4i16_shuffle:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s4, s0
+; GFX8-NEXT: s_mov_b32 s5, s1
+; GFX8-NEXT: s_lshl_b32 s0, s3, 16
+; GFX8-NEXT: s_lshl_b32 s1, s2, 16
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8-NEXT: s_endpgm
+;
+; GFX10-LABEL: build_v2i32_from_v4i16_shuffle:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_lshl_b32 s2, s2, 16
+; GFX10-NEXT: s_lshl_b32 s3, s3, 16
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: build_v2i32_from_v4i16_shuffle:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-NEXT: s_lshl_b32 s3, s3, 16
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+;
+; GFX940-LABEL: build_v2i32_from_v4i16_shuffle:
+; GFX940: ; %bb.0: ; %entry
+; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: s_lshl_b32 s3, s3, 16
+; GFX940-NEXT: s_lshl_b32 s2, s2, 16
+; GFX940-NEXT: v_mov_b32_e32 v0, s2
+; GFX940-NEXT: v_mov_b32_e32 v1, s3
+; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX940-NEXT: s_endpgm
+define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, <4 x i16> %in) {
+entry:
+ %shuf = shufflevector <4 x i16> %in, <4 x i16> zeroinitializer, <2 x i32> <i32 0, i32 2>
+ %zextended = zext <2 x i16> %shuf to <2 x i32>
+ %shifted = shl <2 x i32> %zextended, <i32 16, i32 16>
+ store <2 x i32> %shifted, ptr addrspace(1) %out
+ ret void
+}
+
|
You can test this locally with the following command:git-clang-format --diff dbaa1893c9afe6a245860efb8d68875ba4fd6794 dd27aa384c53dbf93dddd831c2c93202fa10326b -- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp View the diff from clang-format here.diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 7a9cfdf5c3..d611b30af8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -1163,7 +1163,7 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
Results.push_back(Unrolled);
} else {
assert(Node->getNumValues() == Unrolled->getNumValues() &&
- "VectorLegalizer Expand returned wrong number of results!");
+ "VectorLegalizer Expand returned wrong number of results!");
for (unsigned I = 0, E = Unrolled->getNumValues(); I != E; ++I)
Results.push_back(Unrolled.getValue(I));
}
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
removing newline before eof, NFC |
…rn the result of UnrollVectorOp. Otherwise, check that Node and Unrolled has same number of values and return corresponding values. Added testcase that showed mismatched expected arity when Load and chain were returned as separate items after 003b58f
llvm#88475) …return only Load since other output is chain. Added testcase that showed mismatched expected arity when Load and chain were returned as separate items after 003b58f Change-Id: Id81b1092a95f4fd674e573e67a155823e7efac48
…return only Load since other output is chain.
Added testcase that showed mismatched expected arity when Load and chain were returned as separate items after 003b58f