[LLVM][ARM] Latency mutations for cortex m55,m7 and m85 #115153
Conversation
This patch adds latency mutations as a scheduling-related speedup for the above-mentioned cores. When benchmarking this pass on selected benchmarks, we see a performance improvement of 1% on most of them, with some improving by up to 6%. Change-Id: I621a98dfc8ca95e6f6ea2e163b23f5df1c6a22fc Author: David Penry <david.penry@arm.com> Co-authored-by: Nashe Mncube <nashe.mncube@arm.com>
@llvm/pr-subscribers-backend-arm Author: Nashe Mncube (nasherm) Changes: This patch adds latency mutations as a scheduling-related speedup for the above-mentioned cores. When benchmarking this pass on selected benchmarks, we see a performance improvement of 1% on most of them, with some improving by up to 6%. Change-Id: I621a98dfc8ca95e6f6ea2e163b23f5df1c6a22fc Patch is 40.96 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/115153.diff 7 Files Affected:
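For readers who want the shape of the change before reading the full diff: the new file plugs into LLVM's ScheduleDAGMutation interface and works by rewriting the latency recorded on selected dependence edges of the scheduling DAG (see setBidirLatencies further down). The following is a minimal, hedged sketch of that pattern, not the patch itself; the class name and the shouldShorten predicate are illustrative placeholders, while the both-endpoints update mirrors what the patch's setBidirLatencies does.

#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/CodeGen/ScheduleDAGMutation.h"

namespace {

// Illustrative sketch of a DAG mutation that overrides edge latencies.
class ExampleLatencyMutation : public llvm::ScheduleDAGMutation {
public:
  void apply(llvm::ScheduleDAGInstrs *DAG) override {
    for (llvm::SUnit &SU : DAG->SUnits) {
      for (llvm::SDep &Succ : SU.Succs) {
        // Only touch register data dependences; memory and order edges
        // are handled separately in the real pass.
        if (Succ.getKind() != llvm::SDep::Data)
          continue;
        if (!shouldShorten(SU, Succ)) // placeholder policy (see below)
          continue;
        // Dependence edges are stored on both endpoints, so the mirrored
        // predecessor edge must be updated as well.
        llvm::SDep Reverse = Succ;
        Reverse.setSUnit(&SU);
        for (llvm::SDep &Pred : Succ.getSUnit()->Preds) {
          if (Pred == Reverse) {
            Pred.setLatency(1);
            break;
          }
        }
        Succ.setLatency(1);
        SU.setHeightDirty();
        Succ.getSUnit()->setDepthDirty();
      }
    }
  }

private:
  // Hypothetical predicate: a real mutation keys off producer/consumer
  // opcodes and operand types, as the patch does via InstructionInformation.
  bool shouldShorten(const llvm::SUnit &, const llvm::SDep &) const {
    return false;
  }
};

} // namespace

Mutations like this are handed to the scheduler through the subtarget (for example via TargetSubtargetInfo::getPostRAMutations); exactly where this patch registers its per-CPU mutations for Cortex-M7, M55 and M85 lies in the portion of the diff truncated below.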
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
index aee9797585dbd2..b6f20e6f99a0a9 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -973,6 +973,34 @@ unsigned getBLXOpcode(const MachineFunction &MF);
unsigned gettBLXrOpcode(const MachineFunction &MF);
unsigned getBLXpredOpcode(const MachineFunction &MF);
+inline bool isMVEVectorInstruction(const MachineInstr *MI) {
+ // This attempts to remove non-mve instructions (scalar shifts), which
+ // are just DPU CX instruction.
+ switch (MI->getOpcode()) {
+ case ARM::MVE_SQSHL:
+ case ARM::MVE_SRSHR:
+ case ARM::MVE_UQSHL:
+ case ARM::MVE_URSHR:
+ case ARM::MVE_SQRSHR:
+ case ARM::MVE_UQRSHL:
+ case ARM::MVE_ASRLr:
+ case ARM::MVE_ASRLi:
+ case ARM::MVE_LSLLr:
+ case ARM::MVE_LSLLi:
+ case ARM::MVE_LSRL:
+ case ARM::MVE_SQRSHRL:
+ case ARM::MVE_SQSHLL:
+ case ARM::MVE_SRSHRL:
+ case ARM::MVE_UQRSHLL:
+ case ARM::MVE_UQSHLL:
+ case ARM::MVE_URSHRL:
+ return false;
+ }
+ const MCInstrDesc &MCID = MI->getDesc();
+ uint64_t Flags = MCID.TSFlags;
+ return (Flags & ARMII::DomainMask) == ARMII::DomainMVE;
+}
+
} // end namespace llvm
#endif // LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H
diff --git a/llvm/lib/Target/ARM/ARMLatencyMutations.cpp b/llvm/lib/Target/ARM/ARMLatencyMutations.cpp
new file mode 100644
index 00000000000000..93676a5892d259
--- /dev/null
+++ b/llvm/lib/Target/ARM/ARMLatencyMutations.cpp
@@ -0,0 +1,984 @@
+//===- ARMLatencyMutations.cpp - ARM Latency Mutations --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains the ARM definition DAG scheduling mutations which
+/// change inter-instruction latencies
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMLatencyMutations.h"
+#include "ARMSubtarget.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include <algorithm>
+#include <array>
+#include <initializer_list>
+#include <memory>
+#include <utility>
+
+namespace llvm {
+
+namespace {
+
+// Precompute information about opcodes to speed up pass
+
+class InstructionInformation {
+protected:
+ struct IInfo {
+ bool HasBRegAddr : 1; // B-side of addr gen is a register
+ bool HasBRegAddrShift : 1; // B-side of addr gen has a shift
+ bool IsDivide : 1; // Some form of integer divide
+ bool IsInlineShiftALU : 1; // Inline shift+ALU
+ bool IsMultiply : 1; // Some form of integer multiply
+ bool IsMVEIntMAC : 1; // MVE 8/16/32-bit integer MAC operation
+ bool IsNonSubwordLoad : 1; // Load which is a word or larger
+ bool IsShift : 1; // Shift operation
+ bool IsRev : 1; // REV operation
+ bool ProducesQP : 1; // Produces a vector register result
+ bool ProducesDP : 1; // Produces a double-precision register result
+ bool ProducesSP : 1; // Produces a single-precision register result
+ bool ConsumesQP : 1; // Consumes a vector register result
+ bool ConsumesDP : 1; // Consumes a double-precision register result
+ bool ConsumesSP : 1; // Consumes a single-precision register result
+ unsigned MVEIntMACMatched; // Matched operand type (for MVE)
+ unsigned AddressOpMask; // Mask indicating which operands go into AGU
+ IInfo()
+ : HasBRegAddr(false), HasBRegAddrShift(false), IsDivide(false),
+ IsInlineShiftALU(false), IsMultiply(false), IsMVEIntMAC(false),
+ IsNonSubwordLoad(false), IsShift(false), IsRev(false),
+ ProducesQP(false), ProducesDP(false), ProducesSP(false),
+ ConsumesQP(false), ConsumesDP(false), ConsumesSP(false),
+ MVEIntMACMatched(0), AddressOpMask(0) {}
+ };
+ typedef std::array<IInfo, ARM::INSTRUCTION_LIST_END> IInfoArray;
+ IInfoArray Info;
+
+public:
+ // Always available information
+ unsigned getAddressOpMask(unsigned Op) { return Info[Op].AddressOpMask; }
+ bool hasBRegAddr(unsigned Op) { return Info[Op].HasBRegAddr; }
+ bool hasBRegAddrShift(unsigned Op) { return Info[Op].HasBRegAddrShift; }
+ bool isDivide(unsigned Op) { return Info[Op].IsDivide; }
+ bool isInlineShiftALU(unsigned Op) { return Info[Op].IsInlineShiftALU; }
+ bool isMultiply(unsigned Op) { return Info[Op].IsMultiply; }
+ bool isMVEIntMAC(unsigned Op) { return Info[Op].IsMVEIntMAC; }
+ bool isNonSubwordLoad(unsigned Op) { return Info[Op].IsNonSubwordLoad; }
+ bool isRev(unsigned Op) { return Info[Op].IsRev; }
+ bool isShift(unsigned Op) { return Info[Op].IsShift; }
+
+ // information available if markDPConsumers is called.
+ bool producesQP(unsigned Op) { return Info[Op].ProducesQP; }
+ bool producesDP(unsigned Op) { return Info[Op].ProducesDP; }
+ bool producesSP(unsigned Op) { return Info[Op].ProducesSP; }
+ bool consumesQP(unsigned Op) { return Info[Op].ConsumesQP; }
+ bool consumesDP(unsigned Op) { return Info[Op].ConsumesDP; }
+ bool consumesSP(unsigned Op) { return Info[Op].ConsumesSP; }
+
+ bool isMVEIntMACMatched(unsigned SrcOp, unsigned DstOp) {
+ return SrcOp == DstOp || Info[DstOp].MVEIntMACMatched == SrcOp;
+ }
+
+ InstructionInformation(const ARMBaseInstrInfo *TII);
+
+protected:
+ void markDPProducersConsumers(const ARMBaseInstrInfo *TII);
+};
+
+InstructionInformation::InstructionInformation(const ARMBaseInstrInfo *TII) {
+ using namespace ARM;
+
+ std::initializer_list<unsigned> hasBRegAddrList = {
+ t2LDRs, t2LDRBs, t2LDRHs, t2STRs, t2STRBs, t2STRHs,
+ tLDRr, tLDRBr, tLDRHr, tSTRr, tSTRBr, tSTRHr,
+ };
+ for (auto op : hasBRegAddrList) {
+ Info[op].HasBRegAddr = true;
+ }
+
+ std::initializer_list<unsigned> hasBRegAddrShiftList = {
+ t2LDRs, t2LDRBs, t2LDRHs, t2STRs, t2STRBs, t2STRHs,
+ };
+ for (auto op : hasBRegAddrShiftList) {
+ Info[op].HasBRegAddrShift = true;
+ }
+
+ Info[t2SDIV].IsDivide = Info[t2UDIV].IsDivide = true;
+
+ std::initializer_list<unsigned> isInlineShiftALUList = {
+ t2ADCrs, t2ADDSrs, t2ADDrs, t2BICrs, t2EORrs,
+ t2ORNrs, t2RSBSrs, t2RSBrs, t2SBCrs, t2SUBrs,
+ t2SUBSrs, t2CMPrs, t2CMNzrs, t2TEQrs, t2TSTrs,
+ };
+ for (auto op : isInlineShiftALUList) {
+ Info[op].IsInlineShiftALU = true;
+ }
+
+ Info[t2SDIV].IsDivide = Info[t2UDIV].IsDivide = true;
+
+ std::initializer_list<unsigned> isMultiplyList = {
+ t2MUL, t2MLA, t2MLS, t2SMLABB, t2SMLABT, t2SMLAD, t2SMLADX,
+ t2SMLAL, t2SMLALBB, t2SMLALBT, t2SMLALD, t2SMLALDX, t2SMLALTB, t2SMLALTT,
+ t2SMLATB, t2SMLATT, t2SMLAWT, t2SMLSD, t2SMLSDX, t2SMLSLD, t2SMLSLDX,
+ t2SMMLA, t2SMMLAR, t2SMMLS, t2SMMLSR, t2SMMUL, t2SMMULR, t2SMUAD,
+ t2SMUADX, t2SMULBB, t2SMULBT, t2SMULL, t2SMULTB, t2SMULTT, t2SMULWT,
+ t2SMUSD, t2SMUSDX, t2UMAAL, t2UMLAL, t2UMULL, tMUL,
+ };
+ for (auto op : isMultiplyList) {
+ Info[op].IsMultiply = true;
+ }
+
+ std::initializer_list<unsigned> isMVEIntMACList = {
+ MVE_VMLAS_qr_i16, MVE_VMLAS_qr_i32, MVE_VMLAS_qr_i8,
+ MVE_VMLA_qr_i16, MVE_VMLA_qr_i32, MVE_VMLA_qr_i8,
+ MVE_VQDMLAH_qrs16, MVE_VQDMLAH_qrs32, MVE_VQDMLAH_qrs8,
+ MVE_VQDMLASH_qrs16, MVE_VQDMLASH_qrs32, MVE_VQDMLASH_qrs8,
+ MVE_VQRDMLAH_qrs16, MVE_VQRDMLAH_qrs32, MVE_VQRDMLAH_qrs8,
+ MVE_VQRDMLASH_qrs16, MVE_VQRDMLASH_qrs32, MVE_VQRDMLASH_qrs8,
+ MVE_VQDMLADHXs16, MVE_VQDMLADHXs32, MVE_VQDMLADHXs8,
+ MVE_VQDMLADHs16, MVE_VQDMLADHs32, MVE_VQDMLADHs8,
+ MVE_VQDMLSDHXs16, MVE_VQDMLSDHXs32, MVE_VQDMLSDHXs8,
+ MVE_VQDMLSDHs16, MVE_VQDMLSDHs32, MVE_VQDMLSDHs8,
+ MVE_VQRDMLADHXs16, MVE_VQRDMLADHXs32, MVE_VQRDMLADHXs8,
+ MVE_VQRDMLADHs16, MVE_VQRDMLADHs32, MVE_VQRDMLADHs8,
+ MVE_VQRDMLSDHXs16, MVE_VQRDMLSDHXs32, MVE_VQRDMLSDHXs8,
+ MVE_VQRDMLSDHs16, MVE_VQRDMLSDHs32, MVE_VQRDMLSDHs8,
+ };
+ for (auto op : isMVEIntMACList) {
+ Info[op].IsMVEIntMAC = true;
+ }
+
+ std::initializer_list<unsigned> isNonSubwordLoadList = {
+ t2LDRi12, t2LDRi8, t2LDR_POST, t2LDR_PRE, t2LDRpci,
+ t2LDRs, t2LDRDi8, t2LDRD_POST, t2LDRD_PRE, tLDRi,
+ tLDRpci, tLDRr, tLDRspi,
+ };
+ for (auto op : isNonSubwordLoadList) {
+ Info[op].IsNonSubwordLoad = true;
+ }
+
+ std::initializer_list<unsigned> isRevList = {
+ t2REV, t2REV16, t2REVSH, t2RBIT, tREV, tREV16, tREVSH,
+ };
+ for (auto op : isRevList) {
+ Info[op].IsRev = true;
+ }
+
+ std::initializer_list<unsigned> isShiftList = {
+ t2ASRri, t2ASRrr, t2LSLri, t2LSLrr, t2LSRri, t2LSRrr, t2RORri, t2RORrr,
+ tASRri, tASRrr, tLSLSri, tLSLri, tLSLrr, tLSRri, tLSRrr, tROR,
+ };
+ for (auto op : isShiftList) {
+ Info[op].IsShift = true;
+ }
+
+ std::initializer_list<unsigned> Address1List = {
+ t2LDRBi12,
+ t2LDRBi8,
+ t2LDRBpci,
+ t2LDRBs,
+ t2LDRHi12,
+ t2LDRHi8,
+ t2LDRHpci,
+ t2LDRHs,
+ t2LDRSBi12,
+ t2LDRSBi8,
+ t2LDRSBpci,
+ t2LDRSBs,
+ t2LDRSHi12,
+ t2LDRSHi8,
+ t2LDRSHpci,
+ t2LDRSHs,
+ t2LDRi12,
+ t2LDRi8,
+ t2LDRpci,
+ t2LDRs,
+ tLDRBi,
+ tLDRBr,
+ tLDRHi,
+ tLDRHr,
+ tLDRSB,
+ tLDRSH,
+ tLDRi,
+ tLDRpci,
+ tLDRr,
+ tLDRspi,
+ t2STRBi12,
+ t2STRBi8,
+ t2STRBs,
+ t2STRHi12,
+ t2STRHi8,
+ t2STRHs,
+ t2STRi12,
+ t2STRi8,
+ t2STRs,
+ tSTRBi,
+ tSTRBr,
+ tSTRHi,
+ tSTRHr,
+ tSTRi,
+ tSTRr,
+ tSTRspi,
+ VLDRD,
+ VLDRH,
+ VLDRS,
+ VSTRD,
+ VSTRH,
+ VSTRS,
+ MVE_VLD20_16,
+ MVE_VLD20_32,
+ MVE_VLD20_8,
+ MVE_VLD21_16,
+ MVE_VLD21_32,
+ MVE_VLD21_8,
+ MVE_VLD40_16,
+ MVE_VLD40_32,
+ MVE_VLD40_8,
+ MVE_VLD41_16,
+ MVE_VLD41_32,
+ MVE_VLD41_8,
+ MVE_VLD42_16,
+ MVE_VLD42_32,
+ MVE_VLD42_8,
+ MVE_VLD43_16,
+ MVE_VLD43_32,
+ MVE_VLD43_8,
+ MVE_VLDRBS16,
+ MVE_VLDRBS16_rq,
+ MVE_VLDRBS32,
+ MVE_VLDRBS32_rq,
+ MVE_VLDRBU16,
+ MVE_VLDRBU16_rq,
+ MVE_VLDRBU32,
+ MVE_VLDRBU32_rq,
+ MVE_VLDRBU8,
+ MVE_VLDRBU8_rq,
+ MVE_VLDRDU64_qi,
+ MVE_VLDRDU64_rq,
+ MVE_VLDRDU64_rq_u,
+ MVE_VLDRHS32,
+ MVE_VLDRHS32_rq,
+ MVE_VLDRHS32_rq_u,
+ MVE_VLDRHU16,
+ MVE_VLDRHU16_rq,
+ MVE_VLDRHU16_rq_u,
+ MVE_VLDRHU32,
+ MVE_VLDRHU32_rq,
+ MVE_VLDRHU32_rq_u,
+ MVE_VLDRWU32,
+ MVE_VLDRWU32_qi,
+ MVE_VLDRWU32_rq,
+ MVE_VLDRWU32_rq_u,
+ MVE_VST20_16,
+ MVE_VST20_32,
+ MVE_VST20_8,
+ MVE_VST21_16,
+ MVE_VST21_32,
+ MVE_VST21_8,
+ MVE_VST40_16,
+ MVE_VST40_32,
+ MVE_VST40_8,
+ MVE_VST41_16,
+ MVE_VST41_32,
+ MVE_VST41_8,
+ MVE_VST42_16,
+ MVE_VST42_32,
+ MVE_VST42_8,
+ MVE_VST43_16,
+ MVE_VST43_32,
+ MVE_VST43_8,
+ MVE_VSTRB16,
+ MVE_VSTRB16_rq,
+ MVE_VSTRB32,
+ MVE_VSTRB32_rq,
+ MVE_VSTRBU8,
+ MVE_VSTRB8_rq,
+ MVE_VSTRD64_qi,
+ MVE_VSTRD64_rq,
+ MVE_VSTRD64_rq_u,
+ MVE_VSTRH32,
+ MVE_VSTRH32_rq,
+ MVE_VSTRH32_rq_u,
+ MVE_VSTRHU16,
+ MVE_VSTRH16_rq,
+ MVE_VSTRH16_rq_u,
+ MVE_VSTRWU32,
+ MVE_VSTRW32_qi,
+ MVE_VSTRW32_rq,
+ MVE_VSTRW32_rq_u,
+ };
+ std::initializer_list<unsigned> Address2List = {
+ t2LDRB_POST,
+ t2LDRB_PRE,
+ t2LDRDi8,
+ t2LDRH_POST,
+ t2LDRH_PRE,
+ t2LDRSB_POST,
+ t2LDRSB_PRE,
+ t2LDRSH_POST,
+ t2LDRSH_PRE,
+ t2LDR_POST,
+ t2LDR_PRE,
+ t2STRB_POST,
+ t2STRB_PRE,
+ t2STRDi8,
+ t2STRH_POST,
+ t2STRH_PRE,
+ t2STR_POST,
+ t2STR_PRE,
+ MVE_VLD20_16_wb,
+ MVE_VLD20_32_wb,
+ MVE_VLD20_8_wb,
+ MVE_VLD21_16_wb,
+ MVE_VLD21_32_wb,
+ MVE_VLD21_8_wb,
+ MVE_VLD40_16_wb,
+ MVE_VLD40_32_wb,
+ MVE_VLD40_8_wb,
+ MVE_VLD41_16_wb,
+ MVE_VLD41_32_wb,
+ MVE_VLD41_8_wb,
+ MVE_VLD42_16_wb,
+ MVE_VLD42_32_wb,
+ MVE_VLD42_8_wb,
+ MVE_VLD43_16_wb,
+ MVE_VLD43_32_wb,
+ MVE_VLD43_8_wb,
+ MVE_VLDRBS16_post,
+ MVE_VLDRBS16_pre,
+ MVE_VLDRBS32_post,
+ MVE_VLDRBS32_pre,
+ MVE_VLDRBU16_post,
+ MVE_VLDRBU16_pre,
+ MVE_VLDRBU32_post,
+ MVE_VLDRBU32_pre,
+ MVE_VLDRBU8_post,
+ MVE_VLDRBU8_pre,
+ MVE_VLDRDU64_qi_pre,
+ MVE_VLDRHS32_post,
+ MVE_VLDRHS32_pre,
+ MVE_VLDRHU16_post,
+ MVE_VLDRHU16_pre,
+ MVE_VLDRHU32_post,
+ MVE_VLDRHU32_pre,
+ MVE_VLDRWU32_post,
+ MVE_VLDRWU32_pre,
+ MVE_VLDRWU32_qi_pre,
+ MVE_VST20_16_wb,
+ MVE_VST20_32_wb,
+ MVE_VST20_8_wb,
+ MVE_VST21_16_wb,
+ MVE_VST21_32_wb,
+ MVE_VST21_8_wb,
+ MVE_VST40_16_wb,
+ MVE_VST40_32_wb,
+ MVE_VST40_8_wb,
+ MVE_VST41_16_wb,
+ MVE_VST41_32_wb,
+ MVE_VST41_8_wb,
+ MVE_VST42_16_wb,
+ MVE_VST42_32_wb,
+ MVE_VST42_8_wb,
+ MVE_VST43_16_wb,
+ MVE_VST43_32_wb,
+ MVE_VST43_8_wb,
+ MVE_VSTRB16_post,
+ MVE_VSTRB16_pre,
+ MVE_VSTRB32_post,
+ MVE_VSTRB32_pre,
+ MVE_VSTRBU8_post,
+ MVE_VSTRBU8_pre,
+ MVE_VSTRD64_qi_pre,
+ MVE_VSTRH32_post,
+ MVE_VSTRH32_pre,
+ MVE_VSTRHU16_post,
+ MVE_VSTRHU16_pre,
+ MVE_VSTRWU32_post,
+ MVE_VSTRWU32_pre,
+ MVE_VSTRW32_qi_pre,
+ };
+ std::initializer_list<unsigned> Address3List = {
+ t2LDRD_POST,
+ t2LDRD_PRE,
+ t2STRD_POST,
+ t2STRD_PRE,
+ };
+ // Compute a mask of which operands are involved in address computation
+ for (auto &op : Address1List) {
+ Info[op].AddressOpMask = 0x6;
+ }
+ for (auto &op : Address2List) {
+ Info[op].AddressOpMask = 0xc;
+ }
+ for (auto &op : Address3List) {
+ Info[op].AddressOpMask = 0x18;
+ }
+ for (auto &op : hasBRegAddrShiftList) {
+ Info[op].AddressOpMask |= 0x8;
+ }
+}
+
+void InstructionInformation::markDPProducersConsumers(
+ const ARMBaseInstrInfo *TII) {
+ // Learn about all instructions which have FP source/dest registers
+ for (unsigned MI = 0; MI < ARM::INSTRUCTION_LIST_END; ++MI) {
+ const MCInstrDesc &MID = TII->get(MI);
+ auto Operands = MID.operands();
+ for (unsigned OI = 0, OIE = MID.getNumOperands(); OI != OIE; ++OI) {
+ bool MarkQP = false, MarkDP = false, MarkSP = false;
+ switch (Operands[OI].RegClass) {
+ case ARM::MQPRRegClassID:
+ case ARM::DPRRegClassID:
+ case ARM::DPR_8RegClassID:
+ case ARM::DPR_VFP2RegClassID:
+ case ARM::DPairRegClassID:
+ case ARM::DPairSpcRegClassID:
+ case ARM::DQuadRegClassID:
+ case ARM::DQuadSpcRegClassID:
+ case ARM::DTripleRegClassID:
+ case ARM::DTripleSpcRegClassID:
+ MarkDP = true;
+ break;
+ case ARM::QPRRegClassID:
+ case ARM::QPR_8RegClassID:
+ case ARM::QPR_VFP2RegClassID:
+ case ARM::QQPRRegClassID:
+ case ARM::QQQQPRRegClassID:
+ MarkQP = true;
+ break;
+ case ARM::SPRRegClassID:
+ case ARM::SPR_8RegClassID:
+ case ARM::FPWithVPRRegClassID:
+ MarkSP = true;
+ break;
+ default:
+ break;
+ }
+ if (MarkQP) {
+ if (OI < MID.getNumDefs())
+ Info[MI].ProducesQP = true;
+ else
+ Info[MI].ConsumesQP = true;
+ }
+ if (MarkDP) {
+ if (OI < MID.getNumDefs())
+ Info[MI].ProducesDP = true;
+ else
+ Info[MI].ConsumesDP = true;
+ }
+ if (MarkSP) {
+ if (OI < MID.getNumDefs())
+ Info[MI].ProducesSP = true;
+ else
+ Info[MI].ConsumesSP = true;
+ }
+ }
+ }
+}
+
+} // anonymous namespace
+
+static bool hasImplicitCPSRUse(const MachineInstr *MI) {
+ return MI->getDesc().hasImplicitUseOfPhysReg(ARM::CPSR);
+}
+
+void ARMOverrideBypasses::setBidirLatencies(SUnit &SrcSU, SDep &SrcDep,
+ unsigned latency) {
+ SDep Reverse = SrcDep;
+ Reverse.setSUnit(&SrcSU);
+ for (SDep &PDep : SrcDep.getSUnit()->Preds) {
+ if (PDep == Reverse) {
+ PDep.setLatency(latency);
+ SrcDep.getSUnit()->setDepthDirty();
+ break;
+ }
+ }
+ SrcDep.setLatency(latency);
+ SrcSU.setHeightDirty();
+}
+
+static bool mismatchedPred(ARMCC::CondCodes a, ARMCC::CondCodes b) {
+ return (a & 0xe) != (b & 0xe);
+}
+
+// Set output dependences to zero latency for processors which can
+// simultaneously issue to the same register. Returns true if a change
+// was made.
+bool ARMOverrideBypasses::zeroOutputDependences(SUnit &ISU, SDep &Dep) {
+ if (Dep.getKind() == SDep::Output) {
+ setBidirLatencies(ISU, Dep, 0);
+ return true;
+ }
+ return false;
+}
+
+// The graph doesn't look inside of bundles to determine their
+// scheduling boundaries and reports zero latency into and out of them
+// (except for CPSR into the bundle, which has latency 1).
+// Make some better scheduling assumptions:
+// 1) CPSR uses have zero latency; other uses have incoming latency 1
+// 2) CPSR defs retain a latency of zero; others have a latency of 1.
+//
+// Returns 1 if a use change was made; 2 if a def change was made; 0 otherwise
+unsigned ARMOverrideBypasses::makeBundleAssumptions(SUnit &ISU, SDep &Dep) {
+
+ SUnit &DepSU = *Dep.getSUnit();
+ const MachineInstr *SrcMI = ISU.getInstr();
+ unsigned SrcOpcode = SrcMI->getOpcode();
+ const MachineInstr *DstMI = DepSU.getInstr();
+ unsigned DstOpcode = DstMI->getOpcode();
+
+ if (DstOpcode == ARM::BUNDLE && TII->isPredicated(*DstMI)) {
+ setBidirLatencies(
+ ISU, Dep,
+ (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR) ? 0 : 1);
+ return 1;
+ }
+ if (SrcOpcode == ARM::BUNDLE && TII->isPredicated(*SrcMI) &&
+ Dep.isAssignedRegDep() && Dep.getReg() != ARM::CPSR) {
+ setBidirLatencies(ISU, Dep, 1);
+ return 2;
+ }
+ return 0;
+}
+
+// Determine whether there is a memory RAW hazard here and set up latency
+// accordingly
+bool ARMOverrideBypasses::memoryRAWHazard(SUnit &ISU, SDep &Dep,
+ unsigned latency) {
+ if (!Dep.isNormalMemory())
+ return false;
+ auto &SrcInst = *ISU.getInstr();
+ auto &DstInst = *Dep.getSUnit()->getInstr();
+ if (!SrcInst.mayStore() || !DstInst.mayLoad())
+ return false;
+
+ auto SrcMO = *SrcInst.memoperands().begin();
+ auto DstMO = *DstInst.memoperands().begin();
+ auto SrcVal = SrcMO->getValue();
+ auto DstVal = DstMO->getValue();
+ auto SrcPseudoVal = SrcMO->getPseudoValue();
+ auto DstPseudoVal = DstMO->getPseudoValue();
+ if (SrcVal && DstVal && AA->alias(SrcVal, DstVal) == AliasResult::MustAlias &&
+ SrcMO->getOffset() == DstMO->getOffset()) {
+ setBidirLatencies(ISU, Dep, latency);
+ return true;
+ } else if (SrcPseudoVal && DstPseudoVal &&
+ SrcPseudoVal->kind() == DstPseudoVal->kind() &&
+ SrcPseudoVal->kind() == PseudoSourceValue::FixedStack) {
+ // Spills/fills
+ auto FS0 = cast<FixedStackPseudoSourceValue>(SrcPseudoVal);
+ auto FS1 = cast<FixedStackPseudoSourceValue>(DstPseudoVal);
+ if (FS0 == FS1) {
+ setBidirLatencies(ISU, Dep, latency);
+ return true;
+ }
+ }
+ return false;
+}
+
+namespace {
+
+class CortexM7InstructionInformation : public InstructionInformation {
+public:
+ CortexM7InstructionInformation(const ARMBaseInstrInfo *TII)
+ : InstructionInformation(TII) {}
+};
+
+class CortexM7Overrides : public ARMOverrideBypasses {
+public:
+ CortexM7Overrides(const ARMBaseInstrInfo *TII, AAResults *AA)
+ : ARMOverrideBypasses(TII, AA) {
+ if (!DI)
+ DI.reset(new CortexM7InstructionInformation(TII));
+ }
+
+ void modifyBypasses(SUnit &) override;
+
+private:
+ static std::unique_ptr<InstructionInformation> DI;
+};
+
+std::unique_ptr<InstructionInformation> Cortex...
[truncated]
void modifyBypasses(SUnit &) override;

private:
  static std::unique_ptr<InstructionInformation> DI;
I have never been sure if this static that is initialized once and not freed would be acceptable with sanitizers and the like. Is there an alternative that hopefully wouldn't involve spending too much compile time re-calculating the info?
With my latest commit I moved DI (now named II) to the enclosing anonymous namespace. However, I think this effectively makes the variable global, which could be less than ideal. But it's not exposed by a header or anything, so it still preserves its "private" status, I believe.
Change-Id: I75658f04eb5c8764e8bb88453d833f320de27009
I worry that allocating but never freeing the data will trip up some sanitizers. We might want to rewrite this so that we avoid the table entirely, just checking for the opcodes/data we need on the instructions directly.
If we need that we can rewrite it; let's see. Otherwise this LGTM.
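For what it's worth, one conventional alternative that keeps the one-time precomputation while leaving nothing heap-allocated and unfreed for leak checkers to flag is a function-local static. A minimal, self-contained sketch with illustrative stand-in names follows; the real table is built from the ARMBaseInstrInfo, which is omitted here for brevity and would, like the patch's static unique_ptr, effectively be captured once per process.

#include <array>

namespace {

// Stand-in for the patch's InstructionInformation table.
struct OpcodeInfoTable {
  std::array<unsigned, 4096> AddressOpMask{}; // illustrative field
  OpcodeInfoTable() {
    // one-time precomputation over all opcodes would go here
  }
};

const OpcodeInfoTable &getOpcodeInfoTable() {
  // Constructed on first use (thread-safe since C++11) and destroyed at
  // program exit, so there is no unfreed allocation, yet the table is
  // still computed exactly once per process rather than per query.
  static OpcodeInfoTable Table;
  return Table;
}

} // namespace

Whether that beats dropping the table and checking opcodes directly, as suggested above, is the same compile-time versus simplicity trade-off being discussed.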
I'm getting the following warning:
Would you mind taking a look? Thanks!
LLVM Buildbot has detected a new failure on builder. Full details are available at:
https://lab.llvm.org/buildbot/#/builders/186/builds/3995
Here is the relevant piece of the build log for reference:
@nasherm You do need to add new subtargets to that switch statement when adding them. These two should join the big list of CortexM and CortexR processors which have a simple break for their case.
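For context, the breakage comes from a switch over the processor-family enum that does not yet list the newly added values; since these cores need no special handling there, the fix is just to add their cases to the existing group that falls through to a plain break. A schematic sketch only; the enum and function names below are placeholders, not the exact LLVM source.

enum ProcFamily { Others, CortexM7, CortexM55, CortexM85 };

void initSubtargetFeatures(ProcFamily Family) {
  switch (Family) { // no default: -Wswitch flags any unhandled enumerator
  case Others:
    break;
  case CortexM7:
  case CortexM55: // newly added enum values join the group of cores
  case CortexM85: // that simply break, silencing the warning
    break;
  }
}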
LLVM Buildbot has detected a new failure on builder. Full details are available at:
https://lab.llvm.org/buildbot/#/builders/168/builds/5473
Here is the relevant piece of the build log for reference:
PR llvm#115153 added enums which needed to be handled in a switch statement. This trips up buildbot. Change-Id: Ic361cffb51a0924a77adc573fe653a30fe017a42
I believe I've fixed this in PR #116086
PR #115153 added enums which needed to be handled in a switch statement. This trips up buildbot.
LLVM Buildbot has detected a new failure on builder. Full details are available at:
https://lab.llvm.org/buildbot/#/builders/145/builds/3147
Here is the relevant piece of the build log for reference:
LLVM Buildbot has detected a new failure on builder. Full details are available at:
https://lab.llvm.org/buildbot/#/builders/146/builds/1593
Here is the relevant piece of the build log for reference:
LLVM Buildbot has detected a new failure on builder. Full details are available at:
https://lab.llvm.org/buildbot/#/builders/55/builds/3541
Here is the relevant piece of the build log for reference:
LLVM Buildbot has detected a new failure on builder. Full details are available at:
https://lab.llvm.org/buildbot/#/builders/64/builds/1446
Here is the relevant piece of the build log for reference:
This patch adds latency mutations as a scheduling-related speedup for the above-mentioned cores. When benchmarking this pass on selected benchmarks, we see a performance improvement of 1% on most of them, with some improving by up to 6%.
Author: David Penry <david.penry@arm.com>
Co-authored-by: Nashe Mncube <nashe.mncube@arm.com>