Skip to content

Commit

Permalink
[AArch64] Combine store (trunc X to <3 x i8>) to sequence of ST1.b.
Browse files Browse the repository at this point in the history
Improve codegen for store (trunc X to <3 x i8>) by lowering it to a
sequence of three ST1.b stores: the truncate operand is first widened and
bitcast to either v8i8 or v16i8, then the byte lanes holding the truncated
results are extracted and stored individually.

At the moment, there are almost no cases in which such vector operations
will be generated automatically. The motivating case is non-power-of-2
SLP vectorization: llvm#77790
  • Loading branch information
fhahn committed Jan 18, 2024
1 parent 8336515 commit efd07e9
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 21 deletions.
50 changes: 50 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21318,6 +21318,53 @@ bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
(SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
}

// Combine store (trunc X to <3 x i8>) to sequence of ST1.b.
//
// The truncate operand X (a 3-element vector) is widened to 4 elements by
// inserting it into an undef vector, bitcast to v8i8 or v16i8, and the low
// byte of each of the first three source elements is extracted and stored
// with its own single-byte store.
//
// Returns the chain of the last emitted store, or an empty SDValue if the
// combine does not apply. It is skipped for volatile stores, big-endian
// subtargets, stores with an original alignment of 4 or more, values that are
// not a TRUNCATE producing <3 x i8>, and sources whose 4-element widening is
// not 64 or 128 bits wide.
static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
                                   const AArch64Subtarget *Subtarget) {
  SDValue Value = ST->getValue();
  EVT ValueVT = Value.getValueType();

  if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
      ST->getOriginalAlign() >= 4 || Value.getOpcode() != ISD::TRUNCATE ||
      ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3))
    return SDValue();

  // Only the chain result is replaced below, so an indexed store (which also
  // produces an updated pointer) cannot be handled here.
  assert(ST->getOffset().isUndef() && "undef offset expected");

  SDValue Src = Value->getOperand(0);
  EVT WideVT = EVT::getVectorVT(
      *DAG.getContext(), Src.getValueType().getVectorElementType(), 4);

  // The widened vector is reinterpreted as v8i8 or v16i8 below; a BITCAST
  // requires matching sizes, so give up on any other source width (e.g. a
  // <3 x i64> source would widen to 256 bits).
  const uint64_t WideBits = WideVT.getFixedSizeInBits();
  if (WideBits != 64 && WideBits != 128)
    return SDValue();

  SDLoc DL(ST);
  SDValue UndefVector = DAG.getUNDEF(WideVT);
  SDValue WideTrunc =
      DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
                  {UndefVector, Src, DAG.getVectorIdxConstant(0, DL)});
  SDValue Cast = DAG.getNode(ISD::BITCAST, DL,
                             WideBits == 64 ? MVT::v8i8 : MVT::v16i8, WideTrunc);

  // On little-endian, the truncated value of source element I is the byte at
  // lane I * (bytes per source element) of the bitcast vector. Deriving the
  // stride from the source type keeps the lanes correct for i16 as well as
  // i32 sources.
  const uint64_t BytesPerElt = WideVT.getScalarSizeInBits() / 8;

  MachineFunction &MF = DAG.getMachineFunction();
  MachineMemOperand *MMO = ST->getMemOperand();

  // Emit the single-byte store for result element ByteIdx at BasePtr + ByteIdx.
  // The memory operand is derived from the original one so that the pointer
  // offset and alignment describe the byte actually written and the original
  // flags are preserved.
  auto StoreByte = [&](SDValue InChain, unsigned ByteIdx) -> SDValue {
    SDValue Lane =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
                    DAG.getConstant(ByteIdx * BytesPerElt, DL, MVT::i64));
    SDValue Ptr = ST->getBasePtr();
    if (ByteIdx != 0)
      Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(ByteIdx), DL);
    return DAG.getStore(InChain, DL, Lane, Ptr,
                        MF.getMachineMemOperand(MMO, ByteIdx, 1));
  };

  // Keep the original emission order: byte 2, then byte 1, then byte 0.
  SDValue Chain = ST->getChain();
  Chain = StoreByte(Chain, 2);
  Chain = StoreByte(Chain, 1);
  Chain = StoreByte(Chain, 0);

  return Chain;
}

static SDValue performSTORECombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG,
Expand All @@ -21333,6 +21380,9 @@ static SDValue performSTORECombine(SDNode *N,
return EltVT == MVT::f32 || EltVT == MVT::f64;
};

if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
return Res;

// If this is an FP_ROUND followed by a store, fold this into a truncating
// store. We can do this even if this is already a truncstore.
// We purposefully don't care about legality of the nodes here as we know
Expand Down
33 changes: 12 additions & 21 deletions llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
Original file line number Diff line number Diff line change
Expand Up @@ -154,17 +154,12 @@ define <3 x i32> @load_v3i32(ptr %src) {
define void @store_trunc_from_64bits(ptr %src, ptr %dst) {
; CHECK-LABEL: store_trunc_from_64bits:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ldrh w8, [x0, #4]
; CHECK-NEXT: mov.h v0[2], w8
; CHECK-NEXT: xtn.8b v0, v0
; CHECK-NEXT: str s0, [sp, #12]
; CHECK-NEXT: ldrh w9, [sp, #12]
; CHECK-NEXT: strb w8, [x1, #2]
; CHECK-NEXT: strh w9, [x1]
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: add x8, x0, #4
; CHECK-NEXT: ld1r.4h { v0 }, [x8]
; CHECK-NEXT: ldr w8, [x0]
; CHECK-NEXT: strb w8, [x1]
; CHECK-NEXT: add x8, x1, #1
; CHECK-NEXT: st1.b { v0 }[4], [x8]
; CHECK-NEXT: ret
;
; BE-LABEL: store_trunc_from_64bits:
Expand Down Expand Up @@ -236,17 +231,13 @@ entry:
define void @shift_trunc_store(ptr %src, ptr %dst) {
; CHECK-LABEL: shift_trunc_store:
; CHECK: ; %bb.0:
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: shrn.4h v0, v0, #16
; CHECK-NEXT: xtn.8b v1, v0
; CHECK-NEXT: umov.h w8, v0[2]
; CHECK-NEXT: str s1, [sp, #12]
; CHECK-NEXT: ldrh w9, [sp, #12]
; CHECK-NEXT: strb w8, [x1, #2]
; CHECK-NEXT: strh w9, [x1]
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: add x8, x1, #1
; CHECK-NEXT: add x9, x1, #2
; CHECK-NEXT: ushr.4s v0, v0, #16
; CHECK-NEXT: st1.b { v0 }[4], [x8]
; CHECK-NEXT: st1.b { v0 }[8], [x9]
; CHECK-NEXT: st1.b { v0 }[0], [x1]
; CHECK-NEXT: ret
;
; BE-LABEL: shift_trunc_store:
Expand Down

0 comments on commit efd07e9

Please sign in to comment.