Skip to content

Commit

Permalink
[RISCV] Add +optimized-nfN-segment-load-store (#114414)
Browse files Browse the repository at this point in the history
This is a follow-up to #111511, where after benchmarking we learnt that
the Banana Pi F3 has fast segmented loads for not just NF=2, but also
NF=3 and NF=4:
https://github.com/preames/bp3-microarch#vlseg_lmul_x_sew_throughput

This adds tuning features to allow these segment loads and stores to be
costed cheaper and enables them for the spacemit-x60.

It also enables +optimized-nf2-segment-load-store by default in the
generic tuning to maintain the previous behaviour when compiled without
-mcpu or -mtune.
  • Loading branch information
lukel97 authored Nov 3, 2024
1 parent 6bad451 commit beb12f9
Show file tree
Hide file tree
Showing 4 changed files with 210 additions and 87 deletions.
7 changes: 7 additions & 0 deletions llvm/lib/Target/RISCV/RISCVFeatures.td
Original file line number Diff line number Diff line change
Expand Up @@ -1386,6 +1386,13 @@ def TuneOptimizedZeroStrideLoad
"true", "Optimized (perform fewer memory operations)"
"zero-stride vector load">;

// Defines one tuning feature per segment field count NF in 2..8:
//   def name:      TuneOptimizedNF<nf>SegmentLoadStore
//   feature name:  optimized-nf<nf>-segment-load-store
//   subtarget var: HasOptimizedNF<nf>SegmentLoadStore (set to "true")
// Note: adjacent string literals are concatenated with no separator, so the
// first half of the description must end with a space.
foreach nf = {2-8} in
  def TuneOptimizedNF#nf#SegmentLoadStore :
      SubtargetFeature<"optimized-nf"#nf#"-segment-load-store",
                       "HasOptimizedNF"#nf#"SegmentLoadStore",
                       "true", "vlseg"#nf#"eN.v and vsseg"#nf#"eN.v are "
                       "implemented as a wide memory op and shuffle">;

def Experimental
: SubtargetFeature<"experimental", "HasExperimental",
"true", "Experimental intrinsics">;
Expand Down
16 changes: 12 additions & 4 deletions llvm/lib/Target/RISCV/RISCVProcessors.td
Original file line number Diff line number Diff line change
Expand Up @@ -57,15 +57,19 @@ class RISCVTuneProcessorModel<string n,
list<SubtargetFeature> f = []>
: ProcessorModel<n, m, f,tunef>;

// Tune features passed to the generic-rv32 and generic-rv64 processor models.
defvar GenericTuneFeatures = [TuneOptimizedNF2SegmentLoadStore];

def GENERIC_RV32 : RISCVProcessorModel<"generic-rv32",
NoSchedModel,
[Feature32Bit,
FeatureStdExtI]>,
FeatureStdExtI],
GenericTuneFeatures>,
GenericTuneInfo;
def GENERIC_RV64 : RISCVProcessorModel<"generic-rv64",
NoSchedModel,
[Feature64Bit,
FeatureStdExtI]>,
FeatureStdExtI],
GenericTuneFeatures>,
GenericTuneInfo;
// Support generic for compatibility with other targets. The triple will be used
// to change to the appropriate rv32/rv64 version.
Expand Down Expand Up @@ -221,7 +225,8 @@ def SIFIVE_U74 : RISCVProcessorModel<"sifive-u74",

defvar SiFiveX280TuneFeatures = !listconcat(SiFive7TuneFeatures,
[TuneDLenFactor2,
TuneOptimizedZeroStrideLoad]);
TuneOptimizedZeroStrideLoad,
TuneOptimizedNF2SegmentLoadStore]);
def SIFIVE_X280 : RISCVProcessorModel<"sifive-x280", SiFive7Model,
[Feature64Bit,
FeatureStdExtI,
Expand Down Expand Up @@ -472,7 +477,10 @@ def SPACEMIT_X60 : RISCVProcessorModel<"spacemit-x60",
FeatureStdExtZvfh,
FeatureStdExtZvkt,
FeatureStdExtZvl256b]),
[TuneDLenFactor2]>;
[TuneDLenFactor2,
TuneOptimizedNF2SegmentLoadStore,
TuneOptimizedNF3SegmentLoadStore,
TuneOptimizedNF4SegmentLoadStore]>;

def RP2350_HAZARD3 : RISCVProcessorModel<"rp2350-hazard3",
NoSchedModel,
Expand Down
28 changes: 25 additions & 3 deletions llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -716,6 +716,28 @@ RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
}

/// Query the subtarget's per-NF "optimized segment load/store" tuning flag.
///
/// \param NF  Number of fields in the segment access; must be in [2, 8].
/// \param ST  Subtarget whose hasOptimizedNF<NF>SegmentLoadStore() predicate
///            is consulted.
/// \returns   The value of the predicate matching \p NF. Any other \p NF is
///            treated as a programming error (llvm_unreachable).
static bool hasOptimizedSegmentLoadStore(unsigned NF,
                                         const RISCVSubtarget *ST) {
  // Each supported field count maps onto its own subtarget predicate.
  if (NF == 2)
    return ST->hasOptimizedNF2SegmentLoadStore();
  if (NF == 3)
    return ST->hasOptimizedNF3SegmentLoadStore();
  if (NF == 4)
    return ST->hasOptimizedNF4SegmentLoadStore();
  if (NF == 5)
    return ST->hasOptimizedNF5SegmentLoadStore();
  if (NF == 6)
    return ST->hasOptimizedNF6SegmentLoadStore();
  if (NF == 7)
    return ST->hasOptimizedNF7SegmentLoadStore();
  if (NF == 8)
    return ST->hasOptimizedNF8SegmentLoadStore();

  // No predicate exists for field counts outside 2..8.
  llvm_unreachable("Unexpected NF");
}

InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
Expand All @@ -737,9 +759,9 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment,
AddressSpace, DL)) {

// Most available hardware today optimizes NF=2 as one wide memory op
// + Factor * LMUL shuffle ops.
if (Factor == 2) {
// Some processors optimize segment loads/stores as one wide memory op +
// Factor * LMUL shuffle ops.
if (hasOptimizedSegmentLoadStore(Factor, ST)) {
InstructionCost Cost =
getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind);
MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT();
Expand Down
Loading

0 comments on commit beb12f9

Please sign in to comment.