From 6e51ceba536d88f882737c9c4f9ff0ffb0004bfd Mon Sep 17 00:00:00 2001
From: Sander de Smalen
Date: Tue, 3 Dec 2019 11:25:07 +0000
Subject: [PATCH] [AArch64][SVE] Add intrinsics for gather loads with 64-bit
 offsets

This patch adds the following intrinsics for gather loads with 64-bit offsets:
  * @llvm.aarch64.sve.ld1.gather (unscaled offset)
  * @llvm.aarch64.sve.ld1.gather.index (scaled offset)

These intrinsics map 1-1 to the following AArch64 instructions respectively
(examples for half-words):
  * ld1h { z0.d }, p0/z, [x0, z0.d]
  * ld1h { z0.d }, p0/z, [x0, z0.d, lsl #1]

Committing on behalf of Andrzej Warzynski (andwar)

Reviewers: sdesmalen, huntergr, rovka, mgudim, dancgr, rengolin, efriedma

Reviewed By: efriedma

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D70542
---
 llvm/include/llvm/IR/IntrinsicsAArch64.td     | 19 +++++
 .../Target/AArch64/AArch64ISelLowering.cpp    | 85 +++++++++++++++++++
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  4 +
 .../lib/Target/AArch64/AArch64InstrFormats.td | 19 ++++-
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 56 ++++++------
 llvm/lib/Target/AArch64/SVEInstrFormats.td    | 12 ++-
 .../Target/AArch64/Utils/AArch64BaseInfo.h    | 11 +++
 ...insics-gather-loads-64bit-scaled-offset.ll | 59 +++++++++++++
 ...sics-gather-loads-64bit-unscaled-offset.ll | 74 ++++++++++++++++
 9 files changed, 310 insertions(+), 29 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-scaled-offset.ll
 create mode 100644 llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-unscaled-offset.ll

diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 72bc4a2aa216dd..d4ed3d7b8ec5b3 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -942,6 +942,15 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
                 llvm_i32_ty],
                [IntrNoMem]>;
 
+class AdvSIMD_GatherLoad_64bitOffset_Intrinsic
+  : Intrinsic<[llvm_anyvector_ty],
+              [
+                LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                LLVMPointerToElt<0>,
+                LLVMScalarOrSameVectorWidth<0, llvm_i64_ty>
+              ],
+              [IntrReadMem, IntrArgMemOnly]>;
+
 // This class of intrinsics are not intended to be useful within LLVM IR but
 // are instead here to support some of the more regid parts of the ACLE.
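For reference, these IR intrinsics sit underneath the ACLE gather-load builtins
mentioned above. A minimal C++ sketch of the kind of user code expected to
lower through them is shown below; it assumes the usual arm_sve.h spellings
(svld1_gather_u64offset_u64, svld1_gather_u64index_u64), which are not part of
this patch.

  // Illustrative only: uses ACLE builtins from <arm_sve.h>; not added here.
  #include <arm_sve.h>

  // base + vector of byte offsets  ->  @llvm.aarch64.sve.ld1.gather
  svuint64_t load_unscaled(svbool_t pg, const uint64_t *base, svuint64_t offsets) {
    return svld1_gather_u64offset_u64(pg, base, offsets);
  }

  // base + vector of element indices  ->  @llvm.aarch64.sve.ld1.gather.index
  svuint64_t load_scaled(svbool_t pg, const uint64_t *base, svuint64_t indices) {
    return svld1_gather_u64index_u64(pg, base, indices);
  }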
class Builtin_SVCVT @@ -1172,4 +1181,14 @@ def int_aarch64_sve_ucvtf_f64i32 : Builtin_SVCVT<"svcvt_f64_u32_m", llvm_nxv2 def int_aarch64_sve_punpkhi : AdvSIMD_SVE_PUNPKHI_Intrinsic; def int_aarch64_sve_punpklo : AdvSIMD_SVE_PUNPKHI_Intrinsic; + +// +// Gather loads: +// + +// scalar + vector, 64 bit unscaled offsets +def int_aarch64_sve_ld1_gather : AdvSIMD_GatherLoad_64bitOffset_Intrinsic; + +// scalar + vector, 64 bit scaled offsets +def int_aarch64_sve_ld1_gather_index : AdvSIMD_GatherLoad_64bitOffset_Intrinsic; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 461d781effb85b..0f6c2c5a628dcb 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1336,6 +1336,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::UUNPKHI: return "AArch64ISD::UUNPKHI"; case AArch64ISD::UUNPKLO: return "AArch64ISD::UUNPKLO"; case AArch64ISD::INSR: return "AArch64ISD::INSR"; + case AArch64ISD::GLD1: return "AArch64ISD::GLD1"; + case AArch64ISD::GLD1_SCALED: return "AArch64ISD::GLD1_SCALED"; } return nullptr; } @@ -11760,6 +11762,85 @@ static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG, DAG.getConstant(MinOffset, DL, MVT::i64)); } +// Returns an SVE type that ContentTy can be trivially sign or zero extended +// into. +static MVT getSVEContainerType(EVT ContentTy) { + assert(ContentTy.isSimple() && "No SVE containers for extended types"); + + switch (ContentTy.getSimpleVT().SimpleTy) { + default: + llvm_unreachable("No known SVE container for this MVT type"); + case MVT::nxv2i8: + case MVT::nxv2i16: + case MVT::nxv2i32: + case MVT::nxv2i64: + case MVT::nxv2f32: + case MVT::nxv2f64: + return MVT::nxv2i64; + case MVT::nxv4i8: + case MVT::nxv4i16: + case MVT::nxv4i32: + case MVT::nxv4f32: + return MVT::nxv4i32; + } +} + +static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG, + unsigned Opcode) { + EVT RetVT = N->getValueType(0); + assert(RetVT.isScalableVector() && + "Gather loads are only possible for SVE vectors"); + + SDLoc DL(N); + MVT RetElVT = RetVT.getVectorElementType().getSimpleVT(); + unsigned NumElements = AArch64::SVEBitsPerBlock / RetElVT.getSizeInBits(); + + EVT MaxVT = llvm::MVT::getScalableVectorVT(RetElVT, NumElements); + if (RetVT.getSizeInBits().getKnownMinSize() > + MaxVT.getSizeInBits().getKnownMinSize()) + return SDValue(); + + // Depending on the addressing mode, this is either a pointer or a vector of + // pointers (that fits into one register) + const SDValue Base = N->getOperand(3); + // Depending on the addressing mode, this is either a single offset or a + // vector of offsets (that fits into one register) + const SDValue Offset = N->getOperand(4); + + if (!DAG.getTargetLoweringInfo().isTypeLegal(Base.getValueType()) || + !DAG.getTargetLoweringInfo().isTypeLegal(Offset.getValueType())) + return SDValue(); + + // Return value type that is representable in hardware + EVT HwRetVt = getSVEContainerType(RetVT); + + // Keep the original output value type around - this will better inform + // optimisations (e.g. instruction folding when load is followed by + // zext/sext). This will only be used for ints, so the value for FPs + // doesn't matter. 
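+  // For example, a call returning nxv2i16 becomes a GLD1 node producing the
+  // container type nxv2i64, with OutVT recording nxv2i16; the result is then
+  // truncated back to nxv2i16 below, and the recorded element type gives
+  // later combines the option of folding a subsequent sext/zext into the load.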
+ SDValue OutVT = DAG.getValueType(RetVT); + if (RetVT.isFloatingPoint()) + OutVT = DAG.getValueType(HwRetVt); + + SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other); + SDValue Ops[] = {N->getOperand(0), // Chain + N->getOperand(2), // Pg + Base, Offset, OutVT}; + + SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops); + SDValue LoadChain = SDValue(Load.getNode(), 1); + + if (RetVT.isInteger() && (RetVT != HwRetVt)) + Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0)); + + // If the original return value was FP, bitcast accordingly. Doing it here + // means that we can avoid adding TableGen patterns for FPs. + if (RetVT.isFloatingPoint()) + Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0)); + + return DAG.getMergeValues({Load, LoadChain}, DL); +} + SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -11846,6 +11927,10 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, case Intrinsic::aarch64_neon_st3lane: case Intrinsic::aarch64_neon_st4lane: return performNEONPostLDSTCombine(N, DCI, DAG); + case Intrinsic::aarch64_sve_ld1_gather: + return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1); + case Intrinsic::aarch64_sve_ld1_gather_index: + return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SCALED); default: break; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 180dd50dc39681..52bb0f25aa9a17 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -198,6 +198,10 @@ enum NodeType : unsigned { INSR, + // Unsigned gather loads. + GLD1, + GLD1_SCALED, + // NEON Load/Store with post-increment base updates LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE, LD3post, diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 878cb79eb32673..fee825422ca4f9 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -358,6 +358,16 @@ def am_indexed7s128 : ComplexPattern; def am_indexedu6s128 : ComplexPattern; def am_indexeds9s128 : ComplexPattern; +def UImmS2XForm : SDNodeXFormgetTargetConstant(N->getZExtValue() / 2, SDLoc(N), MVT::i64); +}]>; +def UImmS4XForm : SDNodeXFormgetTargetConstant(N->getZExtValue() / 4, SDLoc(N), MVT::i64); +}]>; +def UImmS8XForm : SDNodeXFormgetTargetConstant(N->getZExtValue() / 8, SDLoc(N), MVT::i64); +}]>; + // uimm5sN predicate - True if the immediate is a multiple of N in the range // [0 * N, 32 * N]. 
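 // For example, with uimm5s2 a byte offset of 6 satisfies the predicate
 // (a multiple of 2 and below 32*2) and UImmS2XForm encodes it as 6/2 = 3.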
def UImm5s2Operand : UImmScaledMemoryIndexed<5, 2>; @@ -365,17 +375,20 @@ def UImm5s4Operand : UImmScaledMemoryIndexed<5, 4>; def UImm5s8Operand : UImmScaledMemoryIndexed<5, 8>; def uimm5s2 : Operand, ImmLeaf= 0 && Imm < (32*2) && ((Imm % 2) == 0); }]> { + [{ return Imm >= 0 && Imm < (32*2) && ((Imm % 2) == 0); }], + UImmS2XForm> { let ParserMatchClass = UImm5s2Operand; let PrintMethod = "printImmScale<2>"; } def uimm5s4 : Operand, ImmLeaf= 0 && Imm < (32*4) && ((Imm % 4) == 0); }]> { + [{ return Imm >= 0 && Imm < (32*4) && ((Imm % 4) == 0); }], + UImmS4XForm> { let ParserMatchClass = UImm5s4Operand; let PrintMethod = "printImmScale<4>"; } def uimm5s8 : Operand, ImmLeaf= 0 && Imm < (32*8) && ((Imm % 8) == 0); }]> { + [{ return Imm >= 0 && Imm < (32*8) && ((Imm % 8) == 0); }], + UImmS8XForm> { let ParserMatchClass = UImm5s8Operand; let PrintMethod = "printImmScale<8>"; } diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 43e5ac058885c1..575adeebc5955d 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -10,6 +10,14 @@ // //===----------------------------------------------------------------------===// +def SDT_AArch64_GLD1 : SDTypeProfile<1, 4, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>, + SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> +]>; + +def AArch64ld1_gather : SDNode<"AArch64ISD::GLD1", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ld1_gather_scaled : SDNode<"AArch64ISD::GLD1_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; + let Predicates = [HasSVE] in { def RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr">; @@ -454,33 +462,33 @@ let Predicates = [HasSVE] in { // Gathers using unscaled 64-bit offsets, e.g. 
// ld1h z0.d, p0/z, [x0, z0.d] - defm GLD1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0000, "ld1sb">; - defm GLDFF1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0001, "ldff1sb">; - defm GLD1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0010, "ld1b">; - defm GLDFF1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0011, "ldff1b">; - defm GLD1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0100, "ld1sh">; - defm GLDFF1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0101, "ldff1sh">; - defm GLD1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0110, "ld1h">; - defm GLDFF1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0111, "ldff1h">; - defm GLD1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1000, "ld1sw">; - defm GLDFF1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1001, "ldff1sw">; - defm GLD1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1010, "ld1w">; - defm GLDFF1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1011, "ldff1w">; - defm GLD1D : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d">; - defm GLDFF1D : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d">; + defm GLD1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0000, "ld1sb", null_frag, nxv2i8>; + defm GLDFF1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0001, "ldff1sb", null_frag, nxv2i8>; + defm GLD1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0010, "ld1b", AArch64ld1_gather, nxv2i8>; + defm GLDFF1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0011, "ldff1b", null_frag, nxv2i8>; + defm GLD1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0100, "ld1sh", null_frag, nxv2i16>; + defm GLDFF1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0101, "ldff1sh", null_frag, nxv2i16>; + defm GLD1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0110, "ld1h", AArch64ld1_gather, nxv2i16>; + defm GLDFF1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0111, "ldff1h", null_frag, nxv2i16>; + defm GLD1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1000, "ld1sw", null_frag, nxv2i32>; + defm GLDFF1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1001, "ldff1sw", null_frag, nxv2i32>; + defm GLD1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1010, "ld1w", AArch64ld1_gather, nxv2i32>; + defm GLDFF1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1011, "ldff1w", null_frag, nxv2i32>; + defm GLD1D : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d", AArch64ld1_gather, nxv2i64>; + defm GLDFF1D : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d", null_frag, nxv2i64>; // Gathers using scaled 64-bit offsets, e.g. 
// ld1h z0.d, p0/z, [x0, z0.d, lsl #1] - defm GLD1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0100, "ld1sh", ZPR64ExtLSL16>; - defm GLDFF1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0101, "ldff1sh", ZPR64ExtLSL16>; - defm GLD1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0110, "ld1h", ZPR64ExtLSL16>; - defm GLDFF1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0111, "ldff1h", ZPR64ExtLSL16>; - defm GLD1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1000, "ld1sw", ZPR64ExtLSL32>; - defm GLDFF1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1001, "ldff1sw", ZPR64ExtLSL32>; - defm GLD1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1010, "ld1w", ZPR64ExtLSL32>; - defm GLDFF1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1011, "ldff1w", ZPR64ExtLSL32>; - defm GLD1D : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d", ZPR64ExtLSL64>; - defm GLDFF1D : sve_mem_64b_gld_sv2_64_scaled<0b1111, "ldff1d", ZPR64ExtLSL64>; + defm GLD1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0100, "ld1sh", null_frag, ZPR64ExtLSL16, nxv2i16>; + defm GLDFF1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0101, "ldff1sh", null_frag, ZPR64ExtLSL16, nxv2i16>; + defm GLD1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0110, "ld1h", AArch64ld1_gather_scaled, ZPR64ExtLSL16, nxv2i16>; + defm GLDFF1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0111, "ldff1h", null_frag, ZPR64ExtLSL16, nxv2i16>; + defm GLD1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1000, "ld1sw", null_frag, ZPR64ExtLSL32, nxv2i32>; + defm GLDFF1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1001, "ldff1sw", null_frag, ZPR64ExtLSL32, nxv2i32>; + defm GLD1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1010, "ld1w", AArch64ld1_gather_scaled, ZPR64ExtLSL32, nxv2i32>; + defm GLDFF1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1011, "ldff1w", null_frag, ZPR64ExtLSL32, nxv2i32>; + defm GLD1D : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d", AArch64ld1_gather_scaled, ZPR64ExtLSL64, nxv2i64>; + defm GLDFF1D : sve_mem_64b_gld_sv2_64_scaled<0b1111, "ldff1d", null_frag, ZPR64ExtLSL64, nxv2i64>; // Gathers using unscaled 32-bit offsets unpacked in 64-bits elements, e.g. 
// ld1h z0.d, p0/z, [x0, z0.d, uxtw] diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 9169e463c66276..68329a2a2e4f8c 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -5584,18 +5584,26 @@ multiclass sve_mem_64b_gld_vs_32_unscaled opc, string asm, } multiclass sve_mem_64b_gld_sv2_64_scaled opc, string asm, - RegisterOperand zprext> { + SDPatternOperator op, + RegisterOperand zprext, ValueType vt> { def _SCALED_REAL : sve_mem_64b_gld_sv; def : InstAlias(NAME # _SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>; + + def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt)), + (!cast(NAME # _SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; } -multiclass sve_mem_64b_gld_vs2_64_unscaled opc, string asm> { +multiclass sve_mem_64b_gld_vs2_64_unscaled opc, string asm, + SDPatternOperator op, ValueType vt> { def _REAL : sve_mem_64b_gld_sv; def : InstAlias(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>; + + def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt)), + (!cast(NAME # _REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; } class sve_mem_64b_gld_vi opc, string asm, Operand imm_ty> diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h index 7a4fcac09ec4d8..57c126fe6494bf 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -643,6 +643,17 @@ namespace AArch64II { }; } // end namespace AArch64II +namespace AArch64 { +// The number of bits in a SVE register is architecturally defined +// to be a multiple of this value. If has this number of bits, +// a vector can be stored in a SVE register without any +// redundant bits. If has this number of bits divided by P, +// a vector is stored in a SVE register by placing index i +// in index i*P of a vector. The other elements of the +// vector (such as index 1) are undefined. +static constexpr unsigned SVEBitsPerBlock = 128; +} // end namespace AArch64 + } // end namespace llvm #endif diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-scaled-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-scaled-offset.ll new file mode 100644 index 00000000000000..274eaad0eef1da --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-scaled-offset.ll @@ -0,0 +1,59 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; +; LD1H, LD1W, LD1D: base + 64-bit scaled offset +; e.g. 
ld1h z0.d, p0/z, [x0, z0.d, lsl #1]
+;
+
+define <vscale x 2 x i64> @gld1h_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1h_index
+; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, lsl #1]
+; CHECK-NEXT: mov w8, #65535
+; CHECK-NEXT: mov z1.d, x8
+; CHECK-NEXT: and z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.index.nxv2i16(<vscale x 2 x i1> %pg,
+                                                                             i16* %base,
+                                                                             <vscale x 2 x i64> %b)
+  %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1w_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1w_index
+; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, lsl #2]
+; CHECK-NEXT: mov w8, #-1
+; CHECK-NEXT: mov z1.d, x8
+; CHECK-NEXT: and z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.index.nxv2i32(<vscale x 2 x i1> %pg,
+                                                                             i32* %base,
+                                                                             <vscale x 2 x i64> %b)
+  %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1d_index(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_index
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, lsl #3]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.index.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                             i64* %base,
+                                                                             <vscale x 2 x i64> %b)
+  ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @gld1d_index_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_index_double
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, lsl #3]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.index.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                                double* %base,
+                                                                                <vscale x 2 x i64> %b)
+  ret <vscale x 2 x double> %load
+}
+
+declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.index.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.index.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.index.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.index.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i64>)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-unscaled-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-unscaled-offset.ll
new file mode 100644
index 00000000000000..9a8df453b336f7
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-unscaled-offset.ll
@@ -0,0 +1,74 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+;
+; LD1B, LD1W, LD1H, LD1D: base + 64-bit unscaled offset
+;   e.g. ld1h { z0.d }, p0/z, [x0, z0.d]
+;
+
+define <vscale x 2 x i64> @gld1b_d(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1b_d:
+; CHECK: ld1b { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT: mov w8, #255
+; CHECK-NEXT: mov z1.d, x8
+; CHECK-NEXT: and z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.nxv2i8(<vscale x 2 x i1> %pg,
+                                                                     i8* %base,
+                                                                     <vscale x 2 x i64> %b)
+  %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1h_d(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1h_d:
+; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT: mov w8, #65535
+; CHECK-NEXT: mov z1.d, x8
+; CHECK-NEXT: and z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.nxv2i16(<vscale x 2 x i1> %pg,
+                                                                       i16* %base,
+                                                                       <vscale x 2 x i64> %b)
+  %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1w_d(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: gld1w_d:
+; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT: mov w8, #-1
+; CHECK-NEXT: mov z1.d, x8
+; CHECK-NEXT: and z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.nxv2i32(<vscale x 2 x i1> %pg,
+                                                                       i32* %base,
+                                                                       <vscale x 2 x i64> %offsets)
+  %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1d_d(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_d:
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                       i64* %base,
+                                                                       <vscale x 2 x i64> %b)
+  ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @gld1d_d_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_d_double:
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                          double* %base,
+                                                                          <vscale x 2 x i64> %b)
+  ret <vscale x 2 x double> %load
+}
+
+declare <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.nxv2i8(<vscale x 2 x i1>, i8*, <vscale x 2 x i64>)
+declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i64>)
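As a semantic reference for the two addressing modes tested above (not part of
the patch), the gathers can be modelled by the scalar C++ sketch below. The
function and parameter names are illustrative only; inactive lanes are zeroed,
matching the /z predication used in the checks.

  #include <cstddef>
  #include <cstdint>

  // ld1.gather: element i is loaded from (char*)base + offsets[i] (byte offsets).
  template <typename T, size_t N>
  void gather_unscaled(const bool (&pg)[N], const T *base,
                       const uint64_t (&offsets)[N], T (&out)[N]) {
    const uint8_t *bytes = reinterpret_cast<const uint8_t *>(base);
    for (size_t i = 0; i < N; ++i)
      out[i] = pg[i] ? *reinterpret_cast<const T *>(bytes + offsets[i]) : T(0);
  }

  // ld1.gather.index: element i is loaded from base[indices[i]], i.e. the
  // offset is scaled by sizeof(T) (the lsl #1/#2/#3 in the instructions above).
  template <typename T, size_t N>
  void gather_scaled(const bool (&pg)[N], const T *base,
                     const uint64_t (&indices)[N], T (&out)[N]) {
    for (size_t i = 0; i < N; ++i)
      out[i] = pg[i] ? base[indices[i]] : T(0);
  }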