@@ -169,6 +169,64 @@ class SiFive7GetOrderedReductionCycles<string mx, int sew, int VLEN> {
169169 int c = !mul(6, VLUpperBound);
170170}
171171
172+ class isSingleDLEN<string mx> { // c is set only for the fractional LMULs MF2, MF4 and MF8.
173+ bit c = !or(!eq(mx, "MF2"), !or(!eq(mx, "MF4"), !eq(mx, "MF8")));
174+ }
175+
176+ class SiFive7GetCyclesVRGatherVV<string mx, int sew, int VLEN,
177+ bit hasFastGather> { // Computes c, the cycle count for a vrgather.vv at LMUL mx and element width sew.
178+ // if (hasFastGather && isSingleDLEN(mx))
179+ // c = 1;
180+ // else if (hasFastGather && (log2(SEW/8) + log2(LMUL) <= log2(DLEN / 32)))
181+ // c = LMUL * 2 * ceil(vl * SEW / DLEN);
182+ // else
183+ // c = vl;
184+
185+ defvar y = !logtwo(!div(sew, 8)); // log2(SEW / 8)
186+ defvar x = !cond( // log2(SEW / 8) + log2(LMUL) for the integer LMULs
187+ !eq(mx, "M1") : y,
188+ !eq(mx, "M2") : !add(y, 1),
189+ !eq(mx, "M4") : !add(y, 2),
190+ !eq(mx, "M8") : !add(y, 3),
191+ // Give isSingleDLEN(mx) cases a placeholder value to avoid build failures,
192+ // even though x will go unused for them.
193+ true : 1
194+ );
195+ // LMUL * 2 * ceil(vl * SEW / DLEN) = LMUL * 2 * ceil(2 * LMUL) = 4 * LMUL^2
196+ defvar z = !cond(
197+ !eq(mx, "M1") : 4,
198+ !eq(mx, "M2") : 16,
199+ !eq(mx, "M4") : 64,
200+ !eq(mx, "M8") : 256,
201+ // Give isSingleDLEN(mx) cases a placeholder value to avoid build failures,
202+ // even though z will go unused for them.
203+ true : 1
204+ );
205+ defvar VLUpperBound = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c; // the "c = vl" fallback
206+ bit IsSingleDLEN = isSingleDLEN<mx>.c;
207+
208+ int c = !cond(
209+ !and(hasFastGather, IsSingleDLEN) : 1,
210+ !and(hasFastGather, !le(x, !logtwo(!div(VLEN, 64)))) : z, // VLEN / 64 equals the DLEN / 32 bound above if DLEN is VLEN / 2
211+ true: VLUpperBound
212+ );
213+ }
214+
215+ class SiFive7GetCyclesVCompress<string mx, int sew, int VLEN,
216+ bit hasFastGather> { // Computes c, the cycle count for a vcompress at LMUL mx and element width sew.
217+
218+ // if (hasFastGather && isSingleDLEN(mx))
219+ // c = 1
220+ // else
221+ // c = vl
222+ defvar VLUpperBound = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c; // the "c = vl" fallback
223+ bit IsSingleDLEN = isSingleDLEN<mx>.c;
224+
225+ int c = !if(!and(hasFastGather, IsSingleDLEN),
226+ 1,
227+ VLUpperBound);
228+ }
229+
172230class SiFive7GetSiFiveVFNRClipCycles<string mx, int VLEN> {
173231 int latency = !cond(
174232 !eq(mx, "MF8"): 7,
@@ -259,7 +317,8 @@ multiclass SiFive7WriteResBase<int VLEN,
259317 ProcResourceKind VL, ProcResourceKind VS,
260318 ProcResourceKind VCQ,
261319 SiFive7FPLatencies fpLatencies,
262- bit isFP64Throttled = false> {
320+ bit isFP64Throttled = false,
321+ bit hasFastGather = false> {
263322
264323 // Branching
265324 let Latency = 3 in {
@@ -976,13 +1035,33 @@ multiclass SiFive7WriteResBase<int VLEN,
9761035
9781037 foreach mx = SchedMxList in {
9791038 foreach sew = SchedSEWSet<mx>.val in {
979- defvar Cycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c;
9801038 defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
981- let Latency = !add(Cycles, 3), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
982- defm : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [VCQ, VA1], mx, sew, IsWorstCase>;
983- defm : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [VCQ, VA1], mx, sew, IsWorstCase>;
984- defm : LMULSEWWriteResMXSEW<"WriteVCompressV", [VCQ, VA1], mx, sew, IsWorstCase>;
985- }
1039+ defvar IsSingleDLEN = isSingleDLEN<mx>.c;
1040+
1041+ defvar GatherVVCycles =
1042+ SiFive7GetCyclesVRGatherVV<mx, sew, VLEN, hasFastGather>.c;
1043+ // Slow gather latency: 7 + DLEN / SEW, with DLEN taken as VLEN / 2.
1044+ defvar SlowGatherLat = !add(7, !div(!div(VLEN, 2), sew));
1045+ defvar GatherVVLat = !if(hasFastGather,
1046+ !add(3, GatherVVCycles), SlowGatherLat);
1047+
1048+ let Latency = GatherVVLat, AcquireAtCycles = [0, 1],
1049+ ReleaseAtCycles = [1, !add(5, GatherVVCycles)] in
1050+ defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [VCQ, VA1], mx, sew, IsWorstCase>;
1051+
1052+ // VRGatherEI16VV is not improved by fastGather: it always uses the one-per-element cycle count and the slow latency.
1053+ defvar GatherEI16VVCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c;
1054+ let Latency = SlowGatherLat, AcquireAtCycles = [0, 1],
1055+ ReleaseAtCycles = [1, !add(5, GatherEI16VVCycles)] in
1056+ defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [VCQ, VA1], mx, sew, IsWorstCase>;
1057+
1058+ defvar CompressCycles = SiFive7GetCyclesVCompress<mx, sew, VLEN, hasFastGather>.c;
1059+ defvar CompressLat = !if(!and(hasFastGather, IsSingleDLEN),
1060+ 4,
1061+ !add(7, CompressCycles)); // 7 + VL (on this branch CompressCycles is the VL upper bound)
1062+ let Latency = CompressLat, AcquireAtCycles = [0, 1],
1063+ ReleaseAtCycles = [1, !add(8, CompressCycles)] in
1064+ defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [VCQ, VA1], mx, sew, IsWorstCase>;
9861065 }
9871066 }
9881067
@@ -1408,7 +1487,8 @@ multiclass SiFive7ReadAdvance {
14081487/// eventually be supplied by different SchedMachineModels.
14091488multiclass SiFive7SchedResources<int vlen, bit extraVALU,
14101489 SiFive7FPLatencies fpLatencies,
1411- bit isFP64Throttled> {
1490+ bit isFP64Throttled,
1491+ bit hasFastGather> {
14121492 defm SiFive7 : SiFive7ProcResources<extraVALU>;
14131493
14141494 // Pull out defs from SiFive7ProcResources so we can refer to them by name.
@@ -1435,7 +1515,8 @@ multiclass SiFive7SchedResources<int vlen, bit extraVALU,
14351515 : SiFive7WriteResBase<vlen, SiFive7PipeA, SiFive7PipeB, SiFive7PipeAB,
14361516 SiFive7IDiv, SiFive7FDiv, SiFive7VA1,
14371517 SiFive7VA1OrVA2, SiFive7VL, SiFive7VS,
1438- SiFive7VCQ, fpLatencies, isFP64Throttled>;
1518+ SiFive7VCQ, fpLatencies, isFP64Throttled,
1519+ hasFastGather>;
14391520
14401521 //===----------------------------------------------------------------------===//
14411522 // Bypass and advance
@@ -1468,6 +1549,7 @@ class SiFive7SchedMachineModel<int vlen> : SchedMachineModel {
14681549
14691550 SiFive7FPLatencies FPLatencies;
14701551 bit IsFP64Throttled = false;
1552+ bit HasFastGather = false;
14711553
14721554 string Name = !subst("Model", "", !subst("SiFive7", "", NAME));
14731555}
@@ -1494,14 +1576,16 @@ def SiFive7VLEN1024X300Model : SiFive7SchedMachineModel<1024> {
14941576 let HasExtraVALU = true;
14951577 let FPLatencies = SiFive7LowFPLatencies;
14961578 let IsFP64Throttled = true;
1579+ let HasFastGather = true;
14971580}
14981581
14991582/// Binding models to their scheduling resources.
15001583foreach model = [SiFive7VLEN512Model, SiFive7VLEN1024X300Model] in {
15011584 let SchedModel = model in
15021585 defm model.Name : SiFive7SchedResources<model.VLEN, model.HasExtraVALU,
15031586 model.FPLatencies,
1504- model.IsFP64Throttled>;
1587+ model.IsFP64Throttled,
1588+ model.HasFastGather>; // HasFastGather is false unless the model overrides it
15051589}
15061590
15071591// Some model name aliases.
0 commit comments