[AArch64][SVE] Lower unpredicated loads/stores as fixed LDR/STR with -msve-vector-bits=128. #127500
Conversation
@llvm/pr-subscribers-backend-aarch64

Author: Ricardo Jesus (rj-jesus)

Changes

Given the code below:

```cpp
svuint8_t foo(uint8_t *x) {
  return svld1(svptrue_b8(), x);
}
```

When compiled with -msve-vector-bits=128 (or vscale_range(1, 1)), we currently generate:

```gas
foo:
        ptrue   p0.b
        ld1b    { z0.b }, p0/z, [x0]
        ret
```

Whereas (on little-endian) we could instead be using LDR as follows:

```gas
foo:
        ldr     q0, [x0]
        ret
```

Besides avoiding the predicate dependency, the above form enables further optimisations such as LDP folds. Likewise for other types and stores.

I have a patch that enables similar folds for SVE LDR/STR, but since that causes a fair number of test changes I'd rather open a separate PR for it.

Full diff: https://github.com/llvm/llvm-project/pull/127500.diff

2 Files Affected:
- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
- llvm/test/CodeGen/AArch64/sve-unpred-loads-stores.ll
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4263be1098899..173a875a256e0 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -23550,6 +23550,31 @@ static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
return DAG.getMergeValues({Extract, TokenFactor}, DL);
}
+// Replace scalable loads with fixed loads when vscale_range(1, 1).
+// This enables further optimisations such as LDP folds.
+static SDValue combineVScale1Load(LoadSDNode *LD, SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) {
+ EVT MemVT = LD->getMemoryVT();
+ if (!MemVT.isScalableVector() ||
+ Subtarget->getMaxSVEVectorSizeInBits() != AArch64::SVEBitsPerBlock)
+ return SDValue();
+
+ // Skip unpacked types given their different layouts between Neon and SVE.
+ if (MemVT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
+ return SDValue();
+
+ SDLoc DL(LD);
+ MVT NewVT = MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
+ MemVT.getVectorMinNumElements());
+ SDValue NewLoad = DAG.getLoad(
+ NewVT, DL, LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
+ LD->getOriginalAlign(), LD->getMemOperand()->getFlags(), LD->getAAInfo());
+ SDValue Insert = convertToScalableVector(DAG, MemVT, NewLoad);
+ SDValue TokenFactor = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+ {SDValue(cast<SDNode>(NewLoad), 1)});
+ return DAG.getMergeValues({Insert, TokenFactor}, DL);
+}
+
// Perform TBI simplification if supported by the target and try to break up
// nontemporal loads larger than 256-bits loads for odd types so LDNPQ 256-bit
// load instructions can be selected.
@@ -23587,6 +23612,9 @@ static SDValue performLOADCombine(SDNode *N,
if (SDValue Res = combineV3I8LoadExt(LD, DAG))
return Res;
+ if (SDValue Res = combineVScale1Load(LD, DAG, Subtarget))
+ return Res;
+
if (!LD->isNonTemporal())
return SDValue(N, 0);
@@ -23845,6 +23873,30 @@ static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
return Chain;
}
+// Replace scalable stores with fixed stores when vscale_range(1, 1).
+static SDValue combineVScale1Store(StoreSDNode *ST, SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) {
+ SDValue Value = ST->getValue();
+ EVT ValueVT = Value.getValueType();
+ if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
+ !ValueVT.isScalableVector() ||
+ Subtarget->getMaxSVEVectorSizeInBits() != AArch64::SVEBitsPerBlock)
+ return SDValue();
+
+ // Skip unpacked types given their different layouts between Neon and SVE.
+ if (ValueVT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
+ return SDValue();
+
+ SDLoc DL(ST);
+ MVT NewVT = MVT::getVectorVT(ValueVT.getVectorElementType().getSimpleVT(),
+ ValueVT.getVectorMinNumElements());
+ SDValue NewValue = convertFromScalableVector(DAG, NewVT, Value);
+ SDValue NewStore = DAG.getStore(
+ ST->getChain(), DL, NewValue, ST->getBasePtr(), ST->getPointerInfo(),
+ ST->getOriginalAlign(), ST->getMemOperand()->getFlags(), ST->getAAInfo());
+ return NewStore;
+}
+
static SDValue performSTORECombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG,
@@ -23879,6 +23931,9 @@ static SDValue performSTORECombine(SDNode *N,
if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
return Res;
+ if (SDValue Res = combineVScale1Store(ST, DAG, Subtarget))
+ return Res;
+
// If this is an FP_ROUND followed by a store, fold this into a truncating
// store. We can do this even if this is already a truncstore.
// We purposefully don't care about legality of the nodes here as we know
diff --git a/llvm/test/CodeGen/AArch64/sve-unpred-loads-stores.ll b/llvm/test/CodeGen/AArch64/sve-unpred-loads-stores.ll
new file mode 100644
index 0000000000000..f2d4933d43259
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-unpred-loads-stores.ll
@@ -0,0 +1,483 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -aarch64-sve-vector-bits-max=0 < %s | FileCheck %s --check-prefix=CHECK-VLA
+; RUN: llc -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s --check-prefix=CHECK-128
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define <vscale x 16 x i8> @ld_nxv16i8(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv16i8:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ptrue p0.b
+; CHECK-VLA-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: ld_nxv16i8:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: ldr q0, [x0]
+; CHECK-128-NEXT: ret
+ %2 = load <vscale x 16 x i8>, ptr %0, align 16
+ ret <vscale x 16 x i8> %2
+}
+
+define void @st_nxv16i8(ptr %0, <vscale x 16 x i8> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv16i8:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ptrue p0.b
+; CHECK-VLA-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: st_nxv16i8:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: str q0, [x0]
+; CHECK-128-NEXT: ret
+ store <vscale x 16 x i8> %1, ptr %0, align 16
+ ret void
+}
+
+define <vscale x 8 x i16> @ld_nxv8i16(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv8i16:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ptrue p0.h
+; CHECK-VLA-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: ld_nxv8i16:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: ldr q0, [x0]
+; CHECK-128-NEXT: ret
+ %2 = load <vscale x 8 x i16>, ptr %0, align 16
+ ret <vscale x 8 x i16> %2
+}
+
+define void @st_nxv8i16(ptr %0, <vscale x 8 x i16> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv8i16:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ptrue p0.h
+; CHECK-VLA-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: st_nxv8i16:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: str q0, [x0]
+; CHECK-128-NEXT: ret
+ store <vscale x 8 x i16> %1, ptr %0, align 16
+ ret void
+}
+
+define <vscale x 4 x i32> @ld_nxv4i32(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv4i32:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ptrue p0.s
+; CHECK-VLA-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: ld_nxv4i32:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: ldr q0, [x0]
+; CHECK-128-NEXT: ret
+ %2 = load <vscale x 4 x i32>, ptr %0, align 16
+ ret <vscale x 4 x i32> %2
+}
+
+define void @st_nxv4i32(ptr %0, <vscale x 4 x i32> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv4i32:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ptrue p0.s
+; CHECK-VLA-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: st_nxv4i32:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: str q0, [x0]
+; CHECK-128-NEXT: ret
+ store <vscale x 4 x i32> %1, ptr %0, align 16
+ ret void
+}
+
+define <vscale x 2 x i64> @ld_nxv2i64(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv2i64:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ptrue p0.d
+; CHECK-VLA-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: ld_nxv2i64:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: ldr q0, [x0]
+; CHECK-128-NEXT: ret
+ %2 = load <vscale x 2 x i64>, ptr %0, align 16
+ ret <vscale x 2 x i64> %2
+}
+
+define void @st_nxv2i64(ptr %0, <vscale x 2 x i64> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv2i64:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ptrue p0.d
+; CHECK-VLA-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: st_nxv2i64:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: str q0, [x0]
+; CHECK-128-NEXT: ret
+ store <vscale x 2 x i64> %1, ptr %0, align 16
+ ret void
+}
+
+define <vscale x 8 x half> @ld_nxv8f16(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv8f16:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ptrue p0.h
+; CHECK-VLA-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: ld_nxv8f16:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: ldr q0, [x0]
+; CHECK-128-NEXT: ret
+ %2 = load <vscale x 8 x half>, ptr %0, align 16
+ ret <vscale x 8 x half> %2
+}
+
+define void @st_nxv8f16(ptr %0, <vscale x 8 x half> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv8f16:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ptrue p0.h
+; CHECK-VLA-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: st_nxv8f16:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: str q0, [x0]
+; CHECK-128-NEXT: ret
+ store <vscale x 8 x half> %1, ptr %0, align 16
+ ret void
+}
+
+define <vscale x 4 x float> @ld_nxv4f32(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv4f32:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ptrue p0.s
+; CHECK-VLA-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: ld_nxv4f32:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: ldr q0, [x0]
+; CHECK-128-NEXT: ret
+ %2 = load <vscale x 4 x float>, ptr %0, align 16
+ ret <vscale x 4 x float> %2
+}
+
+define void @st_nxv4f32(ptr %0, <vscale x 4 x float> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv4f32:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ptrue p0.s
+; CHECK-VLA-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: st_nxv4f32:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: str q0, [x0]
+; CHECK-128-NEXT: ret
+ store <vscale x 4 x float> %1, ptr %0, align 16
+ ret void
+}
+
+define <vscale x 2 x double> @ld_nxv2f64(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv2f64:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ptrue p0.d
+; CHECK-VLA-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: ld_nxv2f64:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: ldr q0, [x0]
+; CHECK-128-NEXT: ret
+ %2 = load <vscale x 2 x double>, ptr %0, align 16
+ ret <vscale x 2 x double> %2
+}
+
+define void @st_nxv2f64(ptr %0, <vscale x 2 x double> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv2f64:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ptrue p0.d
+; CHECK-VLA-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: st_nxv2f64:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: str q0, [x0]
+; CHECK-128-NEXT: ret
+ store <vscale x 2 x double> %1, ptr %0, align 16
+ ret void
+}
+
+define <vscale x 16 x i8> @ld_nxv16i8_offset(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv16i8_offset:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ptrue p0.b
+; CHECK-VLA-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: ld_nxv16i8_offset:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: rdvl x8, #1
+; CHECK-128-NEXT: ldr q0, [x0, x8]
+; CHECK-128-NEXT: ret
+ %2 = tail call i64 @llvm.vscale.i64()
+ %3 = shl nuw nsw i64 %2, 4
+ %4 = getelementptr inbounds nuw i8, ptr %0, i64 %3
+ %5 = load <vscale x 16 x i8>, ptr %4, align 16
+ ret <vscale x 16 x i8> %5
+}
+
+define void @st_nxv16i8_offset(ptr %0, <vscale x 16 x i8> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv16i8_offset:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ptrue p0.b
+; CHECK-VLA-NEXT: st1b { z0.b }, p0, [x0, #1, mul vl]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: st_nxv16i8_offset:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: rdvl x8, #1
+; CHECK-128-NEXT: str q0, [x0, x8]
+; CHECK-128-NEXT: ret
+ %3 = tail call i64 @llvm.vscale.i64()
+ %4 = shl nuw nsw i64 %3, 4
+ %5 = getelementptr inbounds nuw i8, ptr %0, i64 %4
+ store <vscale x 16 x i8> %1, ptr %5, align 16
+ ret void
+}
+
+define <vscale x 8 x i16> @ld_nxv8i16_offset(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv8i16_offset:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ptrue p0.h
+; CHECK-VLA-NEXT: ld1h { z0.h }, p0/z, [x0, #1, mul vl]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: ld_nxv8i16_offset:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: rdvl x8, #1
+; CHECK-128-NEXT: ldr q0, [x0, x8]
+; CHECK-128-NEXT: ret
+ %2 = tail call i64 @llvm.vscale.i64()
+ %3 = shl nuw nsw i64 %2, 4
+ %4 = getelementptr inbounds nuw i8, ptr %0, i64 %3
+ %5 = load <vscale x 8 x i16>, ptr %4, align 16
+ ret <vscale x 8 x i16> %5
+}
+
+define void @st_nxv8i16_offset(ptr %0, <vscale x 8 x i16> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv8i16_offset:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ptrue p0.h
+; CHECK-VLA-NEXT: st1h { z0.h }, p0, [x0, #1, mul vl]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: st_nxv8i16_offset:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: rdvl x8, #1
+; CHECK-128-NEXT: str q0, [x0, x8]
+; CHECK-128-NEXT: ret
+ %3 = tail call i64 @llvm.vscale.i64()
+ %4 = shl nuw nsw i64 %3, 4
+ %5 = getelementptr inbounds nuw i8, ptr %0, i64 %4
+ store <vscale x 8 x i16> %1, ptr %5, align 16
+ ret void
+}
+
+define <vscale x 4 x i32> @ld_nxv4i32_offset(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv4i32_offset:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ptrue p0.s
+; CHECK-VLA-NEXT: ld1w { z0.s }, p0/z, [x0, #1, mul vl]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: ld_nxv4i32_offset:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: rdvl x8, #1
+; CHECK-128-NEXT: ldr q0, [x0, x8]
+; CHECK-128-NEXT: ret
+ %2 = tail call i64 @llvm.vscale.i64()
+ %3 = shl nuw nsw i64 %2, 4
+ %4 = getelementptr inbounds nuw i8, ptr %0, i64 %3
+ %5 = load <vscale x 4 x i32>, ptr %4, align 16
+ ret <vscale x 4 x i32> %5
+}
+
+define void @st_nxv4i32_offset(ptr %0, <vscale x 4 x i32> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv4i32_offset:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ptrue p0.s
+; CHECK-VLA-NEXT: st1w { z0.s }, p0, [x0, #1, mul vl]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: st_nxv4i32_offset:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: rdvl x8, #1
+; CHECK-128-NEXT: str q0, [x0, x8]
+; CHECK-128-NEXT: ret
+ %3 = tail call i64 @llvm.vscale.i64()
+ %4 = shl nuw nsw i64 %3, 4
+ %5 = getelementptr inbounds nuw i8, ptr %0, i64 %4
+ store <vscale x 4 x i32> %1, ptr %5, align 16
+ ret void
+}
+
+define <vscale x 2 x i64> @ld_nxv2i64_offset(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv2i64_offset:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ptrue p0.d
+; CHECK-VLA-NEXT: ld1d { z0.d }, p0/z, [x0, #1, mul vl]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: ld_nxv2i64_offset:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: rdvl x8, #1
+; CHECK-128-NEXT: ldr q0, [x0, x8]
+; CHECK-128-NEXT: ret
+ %2 = tail call i64 @llvm.vscale.i64()
+ %3 = shl nuw nsw i64 %2, 4
+ %4 = getelementptr inbounds nuw i8, ptr %0, i64 %3
+ %5 = load <vscale x 2 x i64>, ptr %4, align 16
+ ret <vscale x 2 x i64> %5
+}
+
+define void @st_nxv2i64_offset(ptr %0, <vscale x 2 x i64> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv2i64_offset:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ptrue p0.d
+; CHECK-VLA-NEXT: st1d { z0.d }, p0, [x0, #1, mul vl]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: st_nxv2i64_offset:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: rdvl x8, #1
+; CHECK-128-NEXT: str q0, [x0, x8]
+; CHECK-128-NEXT: ret
+ %3 = tail call i64 @llvm.vscale.i64()
+ %4 = shl nuw nsw i64 %3, 4
+ %5 = getelementptr inbounds nuw i8, ptr %0, i64 %4
+ store <vscale x 2 x i64> %1, ptr %5, align 16
+ ret void
+}
+
+define <vscale x 8 x half> @ld_nxv8f16_offset(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv8f16_offset:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ptrue p0.h
+; CHECK-VLA-NEXT: ld1h { z0.h }, p0/z, [x0, #1, mul vl]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: ld_nxv8f16_offset:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: rdvl x8, #1
+; CHECK-128-NEXT: ldr q0, [x0, x8]
+; CHECK-128-NEXT: ret
+ %2 = tail call i64 @llvm.vscale.i64()
+ %3 = shl nuw nsw i64 %2, 4
+ %4 = getelementptr inbounds nuw i8, ptr %0, i64 %3
+ %5 = load <vscale x 8 x half>, ptr %4, align 16
+ ret <vscale x 8 x half> %5
+}
+
+define void @st_nxv8f16_offset(ptr %0, <vscale x 8 x half> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv8f16_offset:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ptrue p0.h
+; CHECK-VLA-NEXT: st1h { z0.h }, p0, [x0, #1, mul vl]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: st_nxv8f16_offset:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: rdvl x8, #1
+; CHECK-128-NEXT: str q0, [x0, x8]
+; CHECK-128-NEXT: ret
+ %3 = tail call i64 @llvm.vscale.i64()
+ %4 = shl nuw nsw i64 %3, 4
+ %5 = getelementptr inbounds nuw i8, ptr %0, i64 %4
+ store <vscale x 8 x half> %1, ptr %5, align 16
+ ret void
+}
+
+define <vscale x 4 x float> @ld_nxv4f32_offset(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv4f32_offset:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ptrue p0.s
+; CHECK-VLA-NEXT: ld1w { z0.s }, p0/z, [x0, #1, mul vl]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: ld_nxv4f32_offset:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: rdvl x8, #1
+; CHECK-128-NEXT: ldr q0, [x0, x8]
+; CHECK-128-NEXT: ret
+ %2 = tail call i64 @llvm.vscale.i64()
+ %3 = shl nuw nsw i64 %2, 4
+ %4 = getelementptr inbounds nuw i8, ptr %0, i64 %3
+ %5 = load <vscale x 4 x float>, ptr %4, align 16
+ ret <vscale x 4 x float> %5
+}
+
+define void @st_nxv4f32_offset(ptr %0, <vscale x 4 x float> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv4f32_offset:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ptrue p0.s
+; CHECK-VLA-NEXT: st1w { z0.s }, p0, [x0, #1, mul vl]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: st_nxv4f32_offset:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: rdvl x8, #1
+; CHECK-128-NEXT: str q0, [x0, x8]
+; CHECK-128-NEXT: ret
+ %3 = tail call i64 @llvm.vscale.i64()
+ %4 = shl nuw nsw i64 %3, 4
+ %5 = getelementptr inbounds nuw i8, ptr %0, i64 %4
+ store <vscale x 4 x float> %1, ptr %5, align 16
+ ret void
+}
+
+define <vscale x 2 x double> @ld_nxv2f64_offset(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv2f64_offset:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ptrue p0.d
+; CHECK-VLA-NEXT: ld1d { z0.d }, p0/z, [x0, #1, mul vl]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: ld_nxv2f64_offset:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: rdvl x8, #1
+; CHECK-128-NEXT: ldr q0, [x0, x8]
+; CHECK-128-NEXT: ret
+ %2 = tail call i64 @llvm.vscale.i64()
+ %3 = shl nuw nsw i64 %2, 4
+ %4 = getelementptr inbounds nuw i8, ptr %0, i64 %3
+ %5 = load <vscale x 2 x double>, ptr %4, align 16
+ ret <vscale x 2 x double> %5
+}
+
+define void @st_nxv2f64_offset(ptr %0, <vscale x 2 x double> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv2f64_offset:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ptrue p0.d
+; CHECK-VLA-NEXT: st1d { z0.d }, p0, [x0, #1, mul vl]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: st_nxv2f64_offset:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: rdvl x8, #1
+; CHECK-128-NEXT: str q0, [x0, x8]
+; CHECK-128-NEXT: ret
+ %3 = tail call i64 @llvm.vscale.i64()
+ %4 = shl nuw nsw i64 %3, 4
+ %5 = getelementptr inbounds nuw i8, ptr %0, i64 %4
+ store <vscale x 2 x double> %1, ptr %5, align 16
+ ret void
+}
+
+attributes #0 = { "target-features"="+sve" }
@@ -237,8 +223,8 @@ define <vscale x 16 x i8> @ld_nxv16i8_offset(ptr %0) #0 {
;
; CHECK-128-LABEL: ld_nxv16i8_offset:
; CHECK-128: // %bb.0:
; CHECK-128-NEXT: ptrue p0.b
; CHECK-128-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
; CHECK-128-NEXT: rdvl x8, #1
The lowering for the *_offset versions could be improved, but if the equivalent ACLE code was compiled directly we wouldn't have vscale in the IR, and instead would generate:
ldr q0, [x0, #16]
and
str q0, [x0, #16]
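For reference, a minimal sketch of the kind of ACLE code meant here (the helper name and the choice of svld1_vnum are assumptions, not code from this thread):

```c
// Hypothetical ACLE equivalent of the *_offset tests (assumed helper name).
// With -msve-vector-bits=128 the vnum offset is a known multiple of 16 bytes,
// so no vscale-based address arithmetic appears in the IR and the load can
// lower directly to "ldr q0, [x0, #16]".
#include <arm_sve.h>

svuint8_t ld_second_vl(const uint8_t *x) {
  return svld1_vnum_u8(svptrue_b8(), x, 1); // loads from x + 1 * VL
}
```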
Whilst not unreasonable this PR makes me slightly uneasy because we have considerable code within the code generator that goes the other way (i.e. from fixed-length to scalable vector) for:
- wider than NEON fixed length vectors.
- operations unsupported by NEON
- most all fixed length operations when in StreamingSVE mode.
I worry how far this could travel because if it's worth removing the predicate for the loads and stores then presumably the same is true for many other instructions and then we could end up in a situation where legalisation and combines are conflicting.
I have the following thoughts/questions:
- Where do the scalable vector operations come from?
  - Is this because the cost model has chosen scalable auto-vectorisation when fixed length would have been better?
    - I'd rather not complicate code generation just because LoopVectorize has made the wrong call.
  - Is this scalable vector ACLE code that wants to benefit from knowing the exact vector length?
- If the predicate is the main concern, does emitting the SVE fill/spill instructions improve performance?
  - Which could be achieved during isel.
  - If this works then perhaps AArch64LoadStoreOpt could be taught to pair SVE spill/fill instructions when the vector length is known to be 128-bit?
- If this is only the start of taking advantage of knowing the exact vector length then would it be better to have these transformations as a dedicated IR pass? Then the other optimisers can improve things further and the code generator should just work.
As I say, I'm not against the PR but it would be good to understand the direction of travel early to prevent tying selection in knots.
static SDValue combineVScale1Load(LoadSDNode *LD, SelectionDAG &DAG,
                                  const AArch64Subtarget *Subtarget) {
  EVT MemVT = LD->getMemoryVT();
  if (!Subtarget->isNeonAvailable() || !MemVT.isScalableVector() ||
I think hasNEON should be enough here because the NEON load/store instructions remain available when in streaming mode.

Should the combine be restricted to little endian only?
Thanks, I think you're right, I've changed isNeonAvailable to hasNEON.

It should indeed be restricted to little-endian only, but I believe this is already enforced in performLOADCombine (see https://github.com/llvm/llvm-project/blob/90802ec069a88ebce7de370658093ed919480fb8/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp#L23609). Would you rather have a check here in any case?
    return SDValue();

  // Skip unpacked types given their different layouts between Neon and SVE.
  if (MemVT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
As above, I don't see AArch64::SVEBitsPerBlock being relevant. You really mean 128.

It looks like you're only considering the memory VT and ignoring the load's result type. This could be an extending load, which presumably you'll want to ignore as well?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, done as well.
You're right, thank you, I've added a check to ignore extending loads.
@@ -0,0 +1,483 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -aarch64-sve-vector-bits-max=0 < %s | FileCheck %s --check-prefix=CHECK-VLA
Given that general SVE code generation should already be well tested, does the -aarch64-sve-vector-bits-max=0 RUN line offer any new value?
Thanks, I was initially planning to use it to check the general SVE folds to LDR/STR, but you're right, there's already enough coverage for those. I've removed the -aarch64-sve-vector-bits-max=0 RUN line and cleaned up the tests a bit.
Hi @paulwalker-arm, thanks very much for your feedback, I'll address the code changes but first I thought I could reply to your main comments.
Yes, it's scalable ACLE code compiled with -msve-vector-bits=128.
I think this should work too. I have a patch that adds patterns for emitting SVE LDR/STR instead of PTRUE LD1/ST1. I can put it up for review if you'd like to try going this route instead. What do you think?
Whilst having a dedicated pass could be useful, we are not aiming to pursue this "generally" for all instructions. We care about going from scalable to fixed-length vectors for loads and stores mainly to benefit from LDP/STP folds. For example, given (https://godbolt.org/z/orbeMTon3):

```cpp
#include <arm_sve.h>

svfloat64_t foo(const double *x) {
    svbool_t pg = svptrue_b64();
    return svld1_f64(pg, x) + svld1_f64(pg, x+svcntd());
}
```

We currently generate:

```gas
foo:
        ptrue   p0.d
        mov     x8, #2
        ld1d    { z0.d }, p0/z, [x0]
        ld1d    { z1.d }, p0/z, [x0, x8, lsl #3]
        fadd    z0.d, z0.d, z1.d
        ret
```

With this patch, we would instead have:

```gas
foo:
        ldp     q0, q1, [x0]
        fadd    v0.2d, v0.2d, v1.2d
        ret
```

Please let me know what you think. I'm happy to try the route via AArch64LoadStoreOpt you suggested if you think that's a better strategy!
@rj-jesus - From the output I see a NEON fadd. If you're happy to limit the output changes to just the loads and stores then experimenting with emitting SVE fill/spill instructions and extending AArch64LoadStoreOpt would be nice because it could also optimise real spill/fill code. This would be especially beneficial across function calls. Otherwise, given the expected reach as you've described it, I'm happy enough for you to continue down the existing path assuming others agree. However, given this specifically relates to the ACLE, can the combine be restricted to pre type-legalisation? I think that'll cover your use case whilst removing the potential conflicts I'm worried about.
I think the NEON FADD isn't critical to have - I'd be happy with an SVE FADD too, ideally unpredicated, which I do see emitted with the separate patch for SVE LDR/STR.
That sounds good, in that case I'll put up a separate patch for the SVE LDR/STR case and separately experiment with the AArch64LoadStoreOpt alternative. In the meantime I'll also address your comments here, I just haven't had the chance to do so today (I'm sorry).
I think that should work, in fact I believe I tried that in an earlier version of the patch and I seem to recall it working.
Hi @paulwalker-arm, thanks again for your feedback. I should have addressed most of it now. I'm still keen to try the route you suggested via AArch64LoadStoreOpt once #127837 gets sorted (assuming you still think that's the preferable approach). Please let me know if you have any other comments or suggestions.
Hi @paulwalker-arm, it seems that doing this in AArch64LoadStoreOpt won't be a walk in the park: by the time we reach the pass, when compiling with -msve-vector-bits=128, the VL-based offsets of the loads/stores have already been folded into plain constants, which no longer match the addressing modes of the SVE load/store instructions.
I see what you mean, the flaw being the instructions don't support a consistent set of addressing modes? Fair enough, I think for today that means we're better off keeping things simple rather than complicating AArch64LoadStoreOpt. That just leaves the choice over whether to have this in IR or CodeGen. Have you looked into doing this as an InstCombine? I'm hoping doing it in IR will be simpler because there are fewer corner cases to worry about.
Exactly, this seems to be a (likely unintended) side-effect of compiling for VLS SVE. When compiling for VLS SVE, VL-based offsets are folded to constants early on, which then leads to a mismatch with the addressing modes of SVE loads/stores given they generally expect offsets as multiples of VL (example). I think it would be worth "uplifting" these offsets back into multiples of VL, such that they could be folded into immediate-VL-based loads/stores, though I'm not sure where would be the best place to do this. Do you think this is worth trying, probably independently from this PR?
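To make the offset folding concrete, here is a small sketch (the helper name is assumed; this is not the linked example) of the kind of code affected:

```c
// Sketch of the addressing-mode mismatch described above (assumed helper
// name). With -msve-vector-bits=128, "x + svcntb()" is folded to "x + 16"
// early on, so the backend can no longer match the "[x0, #1, mul vl]" form
// of the SVE load and typically materialises the offset in a register.
#include <arm_sve.h>

svuint8_t ld_next_block(const uint8_t *x) {
  return svld1_u8(svptrue_b8(), x + svcntb());
}
```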
I haven't tried this yet as I wanted to limit this to the cases we knew we could fold to Neon LDR/STR to promote LDP/STP merging, though I think we could accomplish the same thing with an extra hook that would inspect the vector type and decide accordingly. Is that what you had in mind?
Oh sorry, for some reason I had it in my head that the source of the loads/stores would all be SVE intrinsics and thus you could extend the existing SVE intrinsic combines.
I've thought the same but it always felt tricky to pull off properly. The last time I looked at it I wondered whether it could be handled entirely during isel via complex patterns. I hesitated because I worried it could cause significant noise and make it harder for machine passes that attempt to analyse/merge offsets. This could be ill founded but with no performance based motivation I figured I'd just ignore it for now. Feel free to investigate this yourself but for my money I think you'll be better off just pushing this PR through and moving on until analysis shows a genuine need to look deeper. I'll caveat this by saying despite implementing the feature the whole …
Thanks for the suggestion, I'll ask around if there are known cases where missing out on the addressing modes is causing problems. In any case, although that wasn't the main goal, the current patch should also help fix the addressing mode issues for VLS 128. For what it's worth, I'm not the biggest fan of …
I've rebased this to see if it gets through the Windows test that was seemingly failing. In the meantime, I think we could also implement this as a few new patterns in isel.
Definitely not. Unless absolutely necessary, let's keep SVE VLS separate from isel. Sorry for the sporadic reviews, I will take another look at the PR tomorrow.
Sounds good, thank you for the quick sanity check!
@@ -0,0 +1,129 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s --check-prefix=CHECK-128
; RUN: llc -mtriple=aarch64_be-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-max=128 < %s | not grep -e ldr -e str
I know there's an existing bug with big endian for SVE ldr/str that I'm happy to ignore for this PR but I do think it's important to check the output for this test if only to ensure we don't erroneously emit NEON ldr/str instructions.
Given you call out ldp/stp as being a core benefit to this transformation, perhaps it's worth adding dedicated tests to verify it happens?
Thanks, I've changed the BE run line to match the full output.
I've also added a simple test for LDP/STP. Please let me know if you had more complex tests in mind.
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const AArch64Subtarget *Subtarget) {
  EVT MemVT = LD->getMemoryVT();
  if (!DCI.isBeforeLegalize() || !Subtarget->hasNEON() ||
This is missing !Subtarget->isLittleEndian()?
I believe this was already checked in performLOADCombine(), but in any case I've added the explicit check here.
…ctor-bits=128.

Given the code below:

```cpp
svuint8_t foo(uint8_t *x) {
  return svld1(svptrue_b8(), x);
}
```

When compiled with -msve-vector-bits=128 (or vscale_range(1, 1)), we currently generate:

```gas
foo:
        ptrue   p0.b
        ld1b    { z0.b }, p0/z, [x0]
        ret
```

Whereas (on little-endian) we could instead be using LDR as follows:

```gas
foo:
        ldr     q0, [x0]
        ret
```

Besides avoiding the predicate dependency, the above form enables further optimisations such as LDP folds. Likewise for stores.
Hi @paulwalker-arm, I'm very sorry for taking so long to update this. Initially, I was waiting on #129732 to get merged to propose the alternative version that combines SVE LDR/STR into LDP/STP directly, but then that PR took a bit longer than expected and I had to put this on hold. I believe I've addressed most of your feedback. I also have a tentative patch implementing the SVE LDR/STR pairing you initially suggested. Would you prefer I open a separate PR for this, or should I include it in this patch so we can decide on the approach to go for? Thanks very much!
Please use a separate PR for the SVE LDR/STR pairing approach. Doing this means this PR can remain in reserve if for some reason we decide to reverse the current direction of travel.
Thanks very much, I've just opened #134068 to attempt the SVE fill/spill pairing route.
…s=128. (#134068)

When compiling with -msve-vector-bits=128 or vscale_range(1, 1) and when the offsets allow it, we can pair SVE LDR/STR instructions into Neon LDP/STP. For example, given:

```cpp
#include <arm_sve.h>

void foo(double const *ldp, double *stp) {
    svbool_t pg = svptrue_b64();
    svfloat64_t ld1 = svld1_f64(pg, ldp);
    svfloat64_t ld2 = svld1_f64(pg, ldp+svcntd());
    svst1_f64(pg, stp, ld1);
    svst1_f64(pg, stp+svcntd(), ld2);
}
```

When compiled with `-msve-vector-bits=128`, we currently generate:

```gas
foo:
        ldr     z0, [x0]
        ldr     z1, [x0, #1, mul vl]
        str     z0, [x1]
        str     z1, [x1, #1, mul vl]
        ret
```

With this patch, we instead generate:

```gas
foo:
        ldp     q0, q1, [x0]
        stp     q0, q1, [x1]
        ret
```

This is an alternative, more targeted approach to #127500.
Given the code below:

```cpp
svuint8_t foo(uint8_t *x) {
  return svld1(svptrue_b8(), x);
}
```

When compiled with -msve-vector-bits=128 (or vscale_range(1, 1)), we currently generate:

```gas
foo:
        ptrue   p0.b
        ld1b    { z0.b }, p0/z, [x0]
        ret
```

Whereas (on little-endian) we could instead be using LDR as follows:

```gas
foo:
        ldr     q0, [x0]
        ret
```

Besides avoiding the predicate dependency, the above form enables further optimisations such as LDP folds. Likewise for other types and stores.

I have a patch that enables similar folds for SVE LDR/STR, but since that causes a fair number of test changes I'd rather open a separate PR for it.
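For completeness, a store-side sketch mirroring the discussion above (the helper name is assumed, not taken from the PR): with -msve-vector-bits=128, lowering the two stores below through fixed-length STR lets the backend merge them into a single STP.

```c
// Store-side illustration (assumed helper name). With -msve-vector-bits=128
// the two SVE stores can be lowered as fixed-length STRs and then merged
// into "stp q0, q1, [x0]".
#include <arm_sve.h>

void store_two(double *dst, svfloat64_t a, svfloat64_t b) {
  svbool_t pg = svptrue_b64();
  svst1_f64(pg, dst, a);
  svst1_f64(pg, dst + svcntd(), b); // dst + one vector length (16 bytes here)
}
```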