-
Notifications
You must be signed in to change notification settings - Fork 12.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[DAGCombiner] Combine vp.strided.store with unit stride to vp.store #66774
Conversation
SDValue DAGCombiner::visitVP_STRIDED_STORE(SDNode *N) {
auto *SST = cast<VPStridedStoreSDNode>(N);
EVT EltVT = SST->getValue().getValueType().getVectorElementType();
// Combine strided loads with unit-stride to a regular load.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"regular" load - a) same comment as load patch, and b) it's not a load (two occurrences in line).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM w/comment addressed
This is the VP equivalent of llvm#66677. If we have a strided store where the stride is equal to the element width, we can just use a regular VP store.
3b93fab
to
703068c
Compare
@llvm/pr-subscribers-llvm-selectiondag Changes: This is the VP equivalent of #66677. If we have a strided store where the stride is equal to the element width, we can just use a regular VP store. I've added the tests in a separate commit in this PR so you can see the test diff. Full diff: https://github.com/llvm/llvm-project/pull/66774.diff 3 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 20a89f24603d07b..db1ebe0e26b9a29 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -540,6 +540,7 @@ namespace {
SDValue visitVPGATHER(SDNode *N);
SDValue visitVPSCATTER(SDNode *N);
SDValue visitVP_STRIDED_LOAD(SDNode *N);
+ SDValue visitVP_STRIDED_STORE(SDNode *N);
SDValue visitFP_TO_FP16(SDNode *N);
SDValue visitFP16_TO_FP(SDNode *N);
SDValue visitFP_TO_BF16(SDNode *N);
@@ -11873,6 +11874,21 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) {
return SDValue();
}
+SDValue DAGCombiner::visitVP_STRIDED_STORE(SDNode *N) {
+ auto *SST = cast<VPStridedStoreSDNode>(N);
+ EVT EltVT = SST->getValue().getValueType().getVectorElementType();
+ // Combine strided stores with unit-stride to a regular VP store.
+ if (auto *CStride = dyn_cast<ConstantSDNode>(SST->getStride());
+ CStride && CStride->getZExtValue() == EltVT.getStoreSize()) {
+ return DAG.getStoreVP(SST->getChain(), SDLoc(N), SST->getValue(),
+ SST->getBasePtr(), SST->getOffset(), SST->getMask(),
+ SST->getVectorLength(), SST->getMemoryVT(),
+ SST->getMemOperand(), SST->getAddressingMode(),
+ SST->isTruncatingStore(), SST->isCompressingStore());
+ }
+ return SDValue();
+}
+
SDValue DAGCombiner::visitVPGATHER(SDNode *N) {
VPGatherSDNode *MGT = cast<VPGatherSDNode>(N);
SDValue Mask = MGT->getMask();
@@ -25997,6 +26013,10 @@ SDValue DAGCombiner::visitVPOp(SDNode *N) {
if (SDValue SD = visitVP_STRIDED_LOAD(N))
return SD;
+ if (N->getOpcode() == ISD::EXPERIMENTAL_VP_STRIDED_STORE)
+ if (SDValue SD = visitVP_STRIDED_STORE(N))
+ return SD;
+
// VP operations in which all vector elements are disabled - either by
// determining that the mask is all false or that the EVL is 0 - can be
// eliminated.
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll
index 781be5f607da162..6c4960bd4078425 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpstore.ll
@@ -84,6 +84,16 @@ define void @strided_vpstore_v8i8(<8 x i8> %val, ptr %ptr, i32 signext %stride,
ret void
}
+define void @strided_vpstore_v8i8_unit_stride(<8 x i8> %val, ptr %ptr, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: strided_vpstore_v8i8_unit_stride:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vse8.v v8, (a0), v0.t
+; CHECK-NEXT: ret
+ call void @llvm.experimental.vp.strided.store.v8i8.p0.i32(<8 x i8> %val, ptr %ptr, i32 1, <8 x i1> %m, i32 %evl)
+ ret void
+}
+
declare void @llvm.experimental.vp.strided.store.v2i16.p0.i32(<2 x i16>, ptr, i32, <2 x i1>, i32)
define void @strided_vpstore_v2i16(<2 x i16> %val, ptr %ptr, i32 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
@@ -120,6 +130,16 @@ define void @strided_vpstore_v8i16(<8 x i16> %val, ptr %ptr, i32 signext %stride
ret void
}
+define void @strided_vpstore_v8i16_unit_stride(<8 x i16> %val, ptr %ptr, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: strided_vpstore_v8i16_unit_stride:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vse16.v v8, (a0), v0.t
+; CHECK-NEXT: ret
+ call void @llvm.experimental.vp.strided.store.v8i16.p0.i32(<8 x i16> %val, ptr %ptr, i32 2, <8 x i1> %m, i32 %evl)
+ ret void
+}
+
declare void @llvm.experimental.vp.strided.store.v2i32.p0.i32(<2 x i32>, ptr, i32, <2 x i1>, i32)
define void @strided_vpstore_v2i32(<2 x i32> %val, ptr %ptr, i32 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
@@ -144,6 +164,16 @@ define void @strided_vpstore_v4i32(<4 x i32> %val, ptr %ptr, i32 signext %stride
ret void
}
+define void @strided_vpstore_v4i32_unit_stride(<4 x i32> %val, ptr %ptr, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: strided_vpstore_v4i32_unit_stride:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vse32.v v8, (a0), v0.t
+; CHECK-NEXT: ret
+ call void @llvm.experimental.vp.strided.store.v4i32.p0.i32(<4 x i32> %val, ptr %ptr, i32 4, <4 x i1> %m, i32 %evl)
+ ret void
+}
+
declare void @llvm.experimental.vp.strided.store.v8i32.p0.i32(<8 x i32>, ptr, i32, <8 x i1>, i32)
define void @strided_vpstore_v8i32(<8 x i32> %val, ptr %ptr, i32 signext %stride, <8 x i1> %m, i32 zeroext %evl) {
@@ -168,6 +198,16 @@ define void @strided_vpstore_v2i64(<2 x i64> %val, ptr %ptr, i32 signext %stride
ret void
}
+define void @strided_vpstore_v2i64_unit_stride(<2 x i64> %val, ptr %ptr, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: strided_vpstore_v2i64_unit_stride:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT: vse64.v v8, (a0), v0.t
+; CHECK-NEXT: ret
+ call void @llvm.experimental.vp.strided.store.v2i64.p0.i32(<2 x i64> %val, ptr %ptr, i32 8, <2 x i1> %m, i32 %evl)
+ ret void
+}
+
declare void @llvm.experimental.vp.strided.store.v4i64.p0.i32(<4 x i64>, ptr, i32, <4 x i1>, i32)
define void @strided_vpstore_v4i64(<4 x i64> %val, ptr %ptr, i32 signext %stride, <4 x i1> %m, i32 zeroext %evl) {
@@ -228,6 +268,16 @@ define void @strided_vpstore_v8f16(<8 x half> %val, ptr %ptr, i32 signext %strid
ret void
}
+define void @strided_vpstore_v8f16_unit_stride(<8 x half> %val, ptr %ptr, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: strided_vpstore_v8f16_unit_stride:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vse16.v v8, (a0), v0.t
+; CHECK-NEXT: ret
+ call void @llvm.experimental.vp.strided.store.v8f16.p0.i32(<8 x half> %val, ptr %ptr, i32 2, <8 x i1> %m, i32 %evl)
+ ret void
+}
+
declare void @llvm.experimental.vp.strided.store.v2f32.p0.i32(<2 x float>, ptr, i32, <2 x i1>, i32)
define void @strided_vpstore_v2f32(<2 x float> %val, ptr %ptr, i32 signext %stride, <2 x i1> %m, i32 zeroext %evl) {
@@ -252,6 +302,16 @@ define void @strided_vpstore_v4f32(<4 x float> %val, ptr %ptr, i32 signext %stri
ret void
}
+define void @strided_vpstore_v4f32_unit_stride(<4 x float> %val, ptr %ptr, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: strided_vpstore_v4f32_unit_stride:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vse32.v v8, (a0), v0.t
+; CHECK-NEXT: ret
+ call void @llvm.experimental.vp.strided.store.v4f32.p0.i32(<4 x float> %val, ptr %ptr, i32 4, <4 x i1> %m, i32 %evl)
+ ret void
+}
+
declare void @llvm.experimental.vp.strided.store.v8f32.p0.i32(<8 x float>, ptr, i32, <8 x i1>, i32)
define void @strided_vpstore_v8f32(<8 x float> %val, ptr %ptr, i32 signext %stride, <8 x i1> %m, i32 zeroext %evl) {
@@ -276,6 +336,16 @@ define void @strided_vpstore_v2f64(<2 x double> %val, ptr %ptr, i32 signext %str
ret void
}
+define void @strided_vpstore_v2f64_unit_stride(<2 x double> %val, ptr %ptr, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: strided_vpstore_v2f64_unit_stride:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT: vse64.v v8, (a0), v0.t
+; CHECK-NEXT: ret
+ call void @llvm.experimental.vp.strided.store.v2f64.p0.i32(<2 x double> %val, ptr %ptr, i32 8, <2 x i1> %m, i32 %evl)
+ ret void
+}
+
declare void @llvm.experimental.vp.strided.store.v4f64.p0.i32(<4 x double>, ptr, i32, <4 x i1>, i32)
define void @strided_vpstore_v4f64(<4 x double> %val, ptr %ptr, i32 signext %stride, <4 x i1> %m, i32 zeroext %evl) {
@@ -343,10 +413,10 @@ define void @strided_store_v32f64(<32 x double> %v, ptr %ptr, i32 signext %strid
; CHECK: # %bb.0:
; CHECK-NEXT: li a4, 16
; CHECK-NEXT: mv a3, a2
-; CHECK-NEXT: bltu a2, a4, .LBB27_2
+; CHECK-NEXT: bltu a2, a4, .LBB34_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: li a3, 16
-; CHECK-NEXT: .LBB27_2:
+; CHECK-NEXT: .LBB34_2:
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t
; CHECK-NEXT: mul a3, a3, a1
@@ -369,10 +439,10 @@ define void @strided_store_v32f64_allones_mask(<32 x double> %v, ptr %ptr, i32 s
; CHECK: # %bb.0:
; CHECK-NEXT: li a4, 16
; CHECK-NEXT: mv a3, a2
-; CHECK-NEXT: bltu a2, a4, .LBB28_2
+; CHECK-NEXT: bltu a2, a4, .LBB35_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: li a3, 16
-; CHECK-NEXT: .LBB28_2:
+; CHECK-NEXT: .LBB35_2:
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT: vsse64.v v8, (a0), a1
; CHECK-NEXT: mul a3, a3, a1
diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll
index b7b9769e82d1b69..cf6ce89b9b5a469 100644
--- a/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll
@@ -96,6 +96,16 @@ define void @strided_vpstore_nxv8i8(<vscale x 8 x i8> %val, ptr %ptr, i32 signex
ret void
}
+define void @strided_vpstore_nxv8i8_unit_stride(<vscale x 8 x i8> %val, ptr %ptr, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: strided_vpstore_nxv8i8_unit_stride:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vse8.v v8, (a0), v0.t
+; CHECK-NEXT: ret
+ call void @llvm.experimental.vp.strided.store.nxv8i8.p0.i32(<vscale x 8 x i8> %val, ptr %ptr, i32 1, <vscale x 8 x i1> %m, i32 %evl)
+ ret void
+}
+
declare void @llvm.experimental.vp.strided.store.nxv1i16.p0.i32(<vscale x 1 x i16>, ptr, i32, <vscale x 1 x i1>, i32)
define void @strided_vpstore_nxv1i16(<vscale x 1 x i16> %val, ptr %ptr, i32 signext %strided, <vscale x 1 x i1> %m, i32 zeroext %evl) {
@@ -132,6 +142,16 @@ define void @strided_vpstore_nxv4i16(<vscale x 4 x i16> %val, ptr %ptr, i32 sign
ret void
}
+define void @strided_vpstore_nxv4i16_unit_stride(<vscale x 4 x i16> %val, ptr %ptr, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: strided_vpstore_nxv4i16_unit_stride:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vse16.v v8, (a0), v0.t
+; CHECK-NEXT: ret
+ call void @llvm.experimental.vp.strided.store.nxv4i16.p0.i32(<vscale x 4 x i16> %val, ptr %ptr, i32 2, <vscale x 4 x i1> %m, i32 %evl)
+ ret void
+}
+
declare void @llvm.experimental.vp.strided.store.nxv8i16.p0.i32(<vscale x 8 x i16>, ptr, i32, <vscale x 8 x i1>, i32)
define void @strided_vpstore_nxv8i16(<vscale x 8 x i16> %val, ptr %ptr, i32 signext %strided, <vscale x 8 x i1> %m, i32 zeroext %evl) {
@@ -180,6 +200,16 @@ define void @strided_vpstore_nxv4i32(<vscale x 4 x i32> %val, ptr %ptr, i32 sign
ret void
}
+define void @strided_vpstore_nxv4i32_unit_stride(<vscale x 4 x i32> %val, ptr %ptr, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: strided_vpstore_nxv4i32_unit_stride:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vse32.v v8, (a0), v0.t
+; CHECK-NEXT: ret
+ call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i32(<vscale x 4 x i32> %val, ptr %ptr, i32 4, <vscale x 4 x i1> %m, i32 %evl)
+ ret void
+}
+
declare void @llvm.experimental.vp.strided.store.nxv8i32.p0.i32(<vscale x 8 x i32>, ptr, i32, <vscale x 8 x i1>, i32)
define void @strided_vpstore_nxv8i32(<vscale x 8 x i32> %val, ptr %ptr, i32 signext %strided, <vscale x 8 x i1> %m, i32 zeroext %evl) {
@@ -204,6 +234,16 @@ define void @strided_vpstore_nxv1i64(<vscale x 1 x i64> %val, ptr %ptr, i32 sign
ret void
}
+define void @strided_vpstore_nxv1i64_unit_stride(<vscale x 1 x i64> %val, ptr %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: strided_vpstore_nxv1i64_unit_stride:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT: vse64.v v8, (a0), v0.t
+; CHECK-NEXT: ret
+ call void @llvm.experimental.vp.strided.store.nxv1i64.p0.i32(<vscale x 1 x i64> %val, ptr %ptr, i32 8, <vscale x 1 x i1> %m, i32 %evl)
+ ret void
+}
+
declare void @llvm.experimental.vp.strided.store.nxv2i64.p0.i32(<vscale x 2 x i64>, ptr, i32, <vscale x 2 x i1>, i32)
define void @strided_vpstore_nxv2i64(<vscale x 2 x i64> %val, ptr %ptr, i32 signext %strided, <vscale x 2 x i1> %m, i32 zeroext %evl) {
@@ -276,6 +316,16 @@ define void @strided_vpstore_nxv4f16(<vscale x 4 x half> %val, ptr %ptr, i32 sig
ret void
}
+define void @strided_vpstore_nxv4f16_unit_stride(<vscale x 4 x half> %val, ptr %ptr, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: strided_vpstore_nxv4f16_unit_stride:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vse16.v v8, (a0), v0.t
+; CHECK-NEXT: ret
+ call void @llvm.experimental.vp.strided.store.nxv4f16.p0.i32(<vscale x 4 x half> %val, ptr %ptr, i32 2, <vscale x 4 x i1> %m, i32 %evl)
+ ret void
+}
+
declare void @llvm.experimental.vp.strided.store.nxv8f16.p0.i32(<vscale x 8 x half>, ptr, i32, <vscale x 8 x i1>, i32)
define void @strided_vpstore_nxv8f16(<vscale x 8 x half> %val, ptr %ptr, i32 signext %strided, <vscale x 8 x i1> %m, i32 zeroext %evl) {
@@ -324,6 +374,16 @@ define void @strided_vpstore_nxv4f32(<vscale x 4 x float> %val, ptr %ptr, i32 si
ret void
}
+define void @strided_vpstore_nxv4f32_unit_stride(<vscale x 4 x float> %val, ptr %ptr, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: strided_vpstore_nxv4f32_unit_stride:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vse32.v v8, (a0), v0.t
+; CHECK-NEXT: ret
+ call void @llvm.experimental.vp.strided.store.nxv4f32.p0.i32(<vscale x 4 x float> %val, ptr %ptr, i32 4, <vscale x 4 x i1> %m, i32 %evl)
+ ret void
+}
+
declare void @llvm.experimental.vp.strided.store.nxv8f32.p0.i32(<vscale x 8 x float>, ptr, i32, <vscale x 8 x i1>, i32)
define void @strided_vpstore_nxv8f32(<vscale x 8 x float> %val, ptr %ptr, i32 signext %strided, <vscale x 8 x i1> %m, i32 zeroext %evl) {
@@ -348,6 +408,16 @@ define void @strided_vpstore_nxv1f64(<vscale x 1 x double> %val, ptr %ptr, i32 s
ret void
}
+define void @strided_vpstore_nxv1f64_unit_stride(<vscale x 1 x double> %val, ptr %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: strided_vpstore_nxv1f64_unit_stride:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT: vse64.v v8, (a0), v0.t
+; CHECK-NEXT: ret
+ call void @llvm.experimental.vp.strided.store.nxv1f64.p0.i32(<vscale x 1 x double> %val, ptr %ptr, i32 8, <vscale x 1 x i1> %m, i32 %evl)
+ ret void
+}
+
declare void @llvm.experimental.vp.strided.store.nxv2f64.p0.i32(<vscale x 2 x double>, ptr, i32, <vscale x 2 x i1>, i32)
define void @strided_vpstore_nxv2f64(<vscale x 2 x double> %val, ptr %ptr, i32 signext %strided, <vscale x 2 x i1> %m, i32 zeroext %evl) {
@@ -427,10 +497,10 @@ define void @strided_store_nxv16f64(<vscale x 16 x double> %v, ptr %ptr, i32 sig
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a3, vlenb
; CHECK-NEXT: mv a4, a2
-; CHECK-NEXT: bltu a2, a3, .LBB34_2
+; CHECK-NEXT: bltu a2, a3, .LBB41_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a4, a3
-; CHECK-NEXT: .LBB34_2:
+; CHECK-NEXT: .LBB41_2:
; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, ma
; CHECK-NEXT: vsse64.v v8, (a0), a1, v0.t
; CHECK-NEXT: sub a5, a2, a3
@@ -454,10 +524,10 @@ define void @strided_store_nxv16f64_allones_mask(<vscale x 16 x double> %v, ptr
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a3, vlenb
; CHECK-NEXT: mv a4, a2
-; CHECK-NEXT: bltu a2, a3, .LBB35_2
+; CHECK-NEXT: bltu a2, a3, .LBB42_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a4, a3
-; CHECK-NEXT: .LBB35_2:
+; CHECK-NEXT: .LBB42_2:
; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, ma
; CHECK-NEXT: vsse64.v v8, (a0), a1
; CHECK-NEXT: sub a3, a2, a3
@@ -485,15 +555,15 @@ define void @strided_store_nxv17f64(<vscale x 17 x double> %v, ptr %ptr, i32 sig
; CHECK-NEXT: slli a6, a4, 1
; CHECK-NEXT: vmv1r.v v24, v0
; CHECK-NEXT: mv a5, a3
-; CHECK-NEXT: bltu a3, a6, .LBB36_2
+; CHECK-NEXT: bltu a3, a6, .LBB43_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a5, a6
-; CHECK-NEXT: .LBB36_2:
+; CHECK-NEXT: .LBB43_2:
; CHECK-NEXT: mv a7, a5
-; CHECK-NEXT: bltu a5, a4, .LBB36_4
+; CHECK-NEXT: bltu a5, a4, .LBB43_4
; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: mv a7, a4
-; CHECK-NEXT: .LBB36_4:
+; CHECK-NEXT: .LBB43_4:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr t0, vlenb
@@ -521,10 +591,10 @@ define void @strided_store_nxv17f64(<vscale x 17 x double> %v, ptr %ptr, i32 sig
; CHECK-NEXT: addi a3, a3, -1
; CHECK-NEXT: and a0, a3, a0
; CHECK-NEXT: vsse64.v v16, (a7), a2, v0.t
-; CHECK-NEXT: bltu a0, a4, .LBB36_6
+; CHECK-NEXT: bltu a0, a4, .LBB43_6
; CHECK-NEXT: # %bb.5:
; CHECK-NEXT: mv a0, a4
-; CHECK-NEXT: .LBB36_6:
+; CHECK-NEXT: .LBB43_6:
; CHECK-NEXT: mul a3, a5, a2
; CHECK-NEXT: add a1, a1, a3
; CHECK-NEXT: srli a4, a4, 2
|
This is the VP equivalent of #66677. If we have a strided store where the stride is equal to the element width, we can just use a regular VP store.
I've added the tests in a separate commit in this PR so you can see the test diff.