@@ -1634,6 +1634,9 @@ static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
1634
1634
case AArch64::STR_PXI:
1635
1635
case AArch64::LDR_ZXI:
1636
1636
case AArch64::LDR_PXI:
1637
+ case AArch64::PTRUE_B:
1638
+ case AArch64::CPY_ZPzI_B:
1639
+ case AArch64::CMPNE_PPzZI_B:
1637
1640
return I->getFlag (MachineInstr::FrameSetup) ||
1638
1641
I->getFlag (MachineInstr::FrameDestroy);
1639
1642
}
@@ -3265,7 +3268,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
3265
3268
StrOpc = RPI.isPaired () ? AArch64::ST1B_2Z_IMM : AArch64::STR_ZXI;
3266
3269
break ;
3267
3270
case RegPairInfo::PPR:
3268
- StrOpc = AArch64::STR_PXI;
3271
+ StrOpc =
3272
+ Size == 16 ? AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO : AArch64::STR_PXI;
3269
3273
break ;
3270
3274
case RegPairInfo::VG:
3271
3275
StrOpc = AArch64::STRXui;
@@ -3494,7 +3498,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
3494
3498
LdrOpc = RPI.isPaired () ? AArch64::LD1B_2Z_IMM : AArch64::LDR_ZXI;
3495
3499
break ;
3496
3500
case RegPairInfo::PPR:
3497
- LdrOpc = AArch64::LDR_PXI;
3501
+ LdrOpc = Size == 16 ? AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO
3502
+ : AArch64::LDR_PXI;
3498
3503
break ;
3499
3504
case RegPairInfo::VG:
3500
3505
continue ;
@@ -3720,6 +3725,14 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
3720
3725
continue ;
3721
3726
}
3722
3727
3728
+ // Always save P4 when PPR spills are ZPR-sized and a predicate above p8 is
3729
+ // spilled. If all of p0-p3 are used as return values p4 must be free
3730
+ // to reload p8-p15.
3731
+ if (RegInfo->getSpillSize (AArch64::PPRRegClass) == 16 &&
3732
+ AArch64::PPR_p8to15RegClass.contains (Reg)) {
3733
+ SavedRegs.set (AArch64::P4);
3734
+ }
3735
+
3723
3736
// MachO's compact unwind format relies on all registers being stored in
3724
3737
// pairs.
3725
3738
// FIXME: the usual format is actually better if unwinding isn't needed.
@@ -4159,8 +4172,295 @@ int64_t AArch64FrameLowering::assignSVEStackObjectOffsets(
4159
4172
true );
4160
4173
}
4161
4174
4175
+ // / Attempts to scavenge a register from \p ScavengeableRegs given the used
4176
+ // / registers in \p UsedRegs.
4177
+ static Register tryScavengeRegister (LiveRegUnits const &UsedRegs,
4178
+ BitVector const &ScavengeableRegs) {
4179
+ for (auto Reg : ScavengeableRegs.set_bits ()) {
4180
+ if (UsedRegs.available (Reg))
4181
+ return Reg;
4182
+ }
4183
+ return AArch64::NoRegister;
4184
+ }
4185
+
4186
+ // / Propagates frame-setup/destroy flags from \p SourceMI to all instructions in
4187
+ // / \p MachineInstrs.
4188
+ static void propagateFrameFlags (MachineInstr &SourceMI,
4189
+ ArrayRef<MachineInstr *> MachineInstrs) {
4190
+ for (MachineInstr *MI : MachineInstrs) {
4191
+ if (SourceMI.getFlag (MachineInstr::FrameSetup))
4192
+ MI->setFlag (MachineInstr::FrameSetup);
4193
+ if (SourceMI.getFlag (MachineInstr::FrameDestroy))
4194
+ MI->setFlag (MachineInstr::FrameDestroy);
4195
+ }
4196
+ }
4197
+
4198
/// RAII helper class for scavenging or spilling a register. On construction
/// attempts to find a free register of class \p RC (given \p UsedRegs and \p
/// AllocatableRegs); if no register can be found, spills \p SpillCandidate to
/// \p MaybeSpillFI to free a register. The freed register is available via
/// freeRegister() / operator*. On destruction, if there was a spill, the
/// previous value of the register is reloaded. The spilling and scavenging is
/// only valid at the insertion point \p MBBI; this class should _not_ be used
/// in places that create or manipulate basic blocks, moving the expected
/// insertion point.
struct ScopedScavengeOrSpill {
  ScopedScavengeOrSpill(const ScopedScavengeOrSpill &) = delete;
  ScopedScavengeOrSpill(ScopedScavengeOrSpill &&) = delete;

  ScopedScavengeOrSpill(MachineFunction &MF, MachineBasicBlock &MBB,
                        MachineBasicBlock::iterator MBBI,
                        Register SpillCandidate, const TargetRegisterClass &RC,
                        LiveRegUnits const &UsedRegs,
                        BitVector const &AllocatableRegs,
                        std::optional<int> *MaybeSpillFI)
      : MBB(MBB), MBBI(MBBI), RC(RC), TII(static_cast<const AArch64InstrInfo &>(
                                          *MF.getSubtarget().getInstrInfo())),
        TRI(*MF.getSubtarget().getRegisterInfo()) {
    // Cheap path: a free register of the right class needs no spill at all.
    FreeReg = tryScavengeRegister(UsedRegs, AllocatableRegs);
    if (FreeReg != AArch64::NoRegister)
      return;
    // No free register: spill SpillCandidate. A null MaybeSpillFI means the
    // caller is in the prologue/epilogue, where spilling is not supported.
    assert(MaybeSpillFI && "Expected emergency spill slot FI information "
                           "(attempted to spill in prologue/epilogue?)");
    // Lazily create the emergency spill slot on first use; it is shared
    // across all expansions that need a register of this class.
    if (!MaybeSpillFI->has_value()) {
      MachineFrameInfo &MFI = MF.getFrameInfo();
      *MaybeSpillFI = MFI.CreateSpillStackObject(TRI.getSpillSize(RC),
                                                 TRI.getSpillAlign(RC));
    }
    FreeReg = SpillCandidate;
    SpillFI = MaybeSpillFI->value();
    TII.storeRegToStackSlot(MBB, MBBI, FreeReg, false, *SpillFI, &RC, &TRI,
                            Register());
  }

  /// Returns true if a spill (rather than a scavenge) was needed.
  bool hasSpilled() const { return SpillFI.has_value(); }

  /// Returns the free register (found from scavenging or spilling a
  /// register).
  Register freeRegister() const { return FreeReg; }

  Register operator*() const { return freeRegister(); }

  ~ScopedScavengeOrSpill() {
    // Restore the spilled register's previous value at the insertion point.
    if (hasSpilled())
      TII.loadRegFromStackSlot(MBB, MBBI, FreeReg, *SpillFI, &RC, &TRI,
                               Register());
  }

private:
  MachineBasicBlock &MBB;
  MachineBasicBlock::iterator MBBI;
  const TargetRegisterClass &RC;
  const AArch64InstrInfo &TII;
  const TargetRegisterInfo &TRI;
  Register FreeReg = AArch64::NoRegister;
  std::optional<int> SpillFI; // Engaged only when a spill was required.
};
4257
+
4258
/// Emergency stack slots for expanding SPILL_PPR_TO_ZPR_SLOT_PSEUDO and
/// FILL_PPR_FROM_ZPR_SLOT_PSEUDO. Each slot is created lazily on first use
/// (see ScopedScavengeOrSpill) and then reused by later expansions.
struct EmergencyStackSlots {
  std::optional<int> ZPRSpillFI; // Slot for freeing a ZPR scratch register.
  std::optional<int> PPRSpillFI; // Slot for freeing a p0-p7 scratch register.
  std::optional<int> GPRSpillFI; // Slot for freeing a GPR (NZCV save) register.
};
4265
+
4266
/// Registers available for scavenging (ZPR, PPR3b, GPR).
struct ScavengeableRegs {
  BitVector ZPRRegs;   // Allocatable ZPRs (hold the predicate as data).
  BitVector PPR3bRegs; // Allocatable p0-p7 (usable as CMPNE's second operand).
  BitVector GPRRegs;   // Allocatable 64-bit GPRs (for saving NZCV).
};
4272
+
4273
+ static bool isInPrologueOrEpilogue (const MachineInstr &MI) {
4274
+ return MI.getFlag (MachineInstr::FrameSetup) ||
4275
+ MI.getFlag (MachineInstr::FrameDestroy);
4276
+ }
4277
+
4278
/// Expands:
/// ```
/// SPILL_PPR_TO_ZPR_SLOT_PSEUDO $p0, %stack.0, 0
/// ```
/// To:
/// ```
/// $z0 = CPY_ZPzI_B $p0, 1, 0
/// STR_ZXI $z0, $stack.0, 0
/// ```
/// While ensuring a ZPR ($z0 in this example) is free for the predicate
/// (spilling if necessary).
static void expandSpillPPRToZPRSlotPseudo(MachineBasicBlock &MBB,
                                          MachineInstr &MI,
                                          const TargetRegisterInfo &TRI,
                                          LiveRegUnits const &UsedRegs,
                                          ScavengeableRegs const &SR,
                                          EmergencyStackSlots &SpillSlots) {
  MachineFunction &MF = *MBB.getParent();
  auto *TII =
      static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());

  // Find (or free by spilling) a ZPR to hold the expanded predicate value.
  // Emergency spill slots are only available outside the prologue/epilogue.
  ScopedScavengeOrSpill ZPredReg(
      MF, MBB, MI, AArch64::Z0, AArch64::ZPRRegClass, UsedRegs, SR.ZPRRegs,
      isInPrologueOrEpilogue(MI) ? nullptr : &SpillSlots.ZPRSpillFI);

  SmallVector<MachineInstr *, 2> MachineInstrs;
  const DebugLoc &DL = MI.getDebugLoc();
  // Materialize the predicate into the ZPR as 1/0 bytes (CPY_ZPzI_B).
  MachineInstrs.push_back(BuildMI(MBB, MI, DL, TII->get(AArch64::CPY_ZPzI_B))
                              .addReg(*ZPredReg, RegState::Define)
                              .add(MI.getOperand(0))
                              .addImm(1)
                              .addImm(0)
                              .getInstr());
  // Store the ZPR to the pseudo's stack slot, reusing its memory operands.
  MachineInstrs.push_back(BuildMI(MBB, MI, DL, TII->get(AArch64::STR_ZXI))
                              .addReg(*ZPredReg)
                              .add(MI.getOperand(1))
                              .addImm(MI.getOperand(2).getImm())
                              .setMemRefs(MI.memoperands())
                              .getInstr());
  propagateFrameFlags(MI, MachineInstrs);
}
4319
+
4320
/// Expands:
/// ```
/// $p0 = FILL_PPR_FROM_ZPR_SLOT_PSEUDO %stack.0, 0
/// ```
/// To:
/// ```
/// $z0 = LDR_ZXI %stack.0, 0
/// $p0 = PTRUE_B 31, implicit $vg
/// $p0 = CMPNE_PPzZI_B $p0, $z0, 0, implicit-def $nzcv
/// ```
/// While ensuring a ZPR ($z0 in this example) is free for the predicate
/// (spilling if necessary). If the status flags are in use at the point of
/// expansion they are preserved (by moving them to/from a GPR). This may
/// cause an additional spill if no GPR is free at the expansion point.
/// Returns true if a predicate register had to be spilled to perform the
/// expansion.
static bool expandFillPPRFromZPRSlotPseudo(MachineBasicBlock &MBB,
                                           MachineInstr &MI,
                                           const TargetRegisterInfo &TRI,
                                           LiveRegUnits const &UsedRegs,
                                           ScavengeableRegs const &SR,
                                           EmergencyStackSlots &SpillSlots) {
  MachineFunction &MF = *MBB.getParent();
  auto *TII =
      static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());

  // A ZPR to load the stored vector into (scavenged or spilled).
  ScopedScavengeOrSpill ZPredReg(
      MF, MBB, MI, AArch64::Z0, AArch64::ZPRRegClass, UsedRegs, SR.ZPRRegs,
      isInPrologueOrEpilogue(MI) ? nullptr : &SpillSlots.ZPRSpillFI);

  // A predicate in p0-p7 for the all-true mask (CMPNE's governing predicate
  // must come from the PPR_3b class).
  ScopedScavengeOrSpill PredReg(
      MF, MBB, MI, AArch64::P0, AArch64::PPR_3bRegClass, UsedRegs, SR.PPR3bRegs,
      isInPrologueOrEpilogue(MI) ? nullptr : &SpillSlots.PPRSpillFI);

  // Elide NZCV spills if we know it is not used.
  bool IsNZCVUsed = !UsedRegs.available(AArch64::NZCV);
  std::optional<ScopedScavengeOrSpill> NZCVSaveReg;
  if (IsNZCVUsed)
    NZCVSaveReg.emplace(
        MF, MBB, MI, AArch64::X0, AArch64::GPR64RegClass, UsedRegs, SR.GPRRegs,
        isInPrologueOrEpilogue(MI) ? nullptr : &SpillSlots.GPRSpillFI);
  SmallVector<MachineInstr *, 4> MachineInstrs;
  const DebugLoc &DL = MI.getDebugLoc();
  // Reload the spilled predicate as a data vector, reusing the pseudo's
  // frame-index operand, offset, and memory operands.
  MachineInstrs.push_back(BuildMI(MBB, MI, DL, TII->get(AArch64::LDR_ZXI))
                              .addReg(*ZPredReg, RegState::Define)
                              .add(MI.getOperand(1))
                              .addImm(MI.getOperand(2).getImm())
                              .setMemRefs(MI.memoperands())
                              .getInstr());
  // Save NZCV to a GPR before the CMPNE below clobbers it.
  if (IsNZCVUsed)
    MachineInstrs.push_back(
        BuildMI(MBB, MI, DL, TII->get(AArch64::MRS))
            .addReg(NZCVSaveReg->freeRegister(), RegState::Define)
            .addImm(AArch64SysReg::NZCV)
            .addReg(AArch64::NZCV, RegState::Implicit)
            .getInstr());
  // All-true byte predicate (pattern 31) to govern the compare.
  MachineInstrs.push_back(BuildMI(MBB, MI, DL, TII->get(AArch64::PTRUE_B))
                              .addReg(*PredReg, RegState::Define)
                              .addImm(31));
  // Rebuild the destination predicate: lanes where the loaded bytes are != 0.
  MachineInstrs.push_back(
      BuildMI(MBB, MI, DL, TII->get(AArch64::CMPNE_PPzZI_B))
          .addReg(MI.getOperand(0).getReg(), RegState::Define)
          .addReg(*PredReg)
          .addReg(*ZPredReg)
          .addImm(0)
          .addReg(AArch64::NZCV, RegState::ImplicitDefine)
          .getInstr());
  // Restore the saved NZCV value.
  if (IsNZCVUsed)
    MachineInstrs.push_back(BuildMI(MBB, MI, DL, TII->get(AArch64::MSR))
                                .addImm(AArch64SysReg::NZCV)
                                .addReg(NZCVSaveReg->freeRegister())
                                .addReg(AArch64::NZCV, RegState::ImplicitDefine)
                                .getInstr());

  propagateFrameFlags(MI, MachineInstrs);
  return PredReg.hasSpilled();
}
4395
+
4396
/// Expands all FILL_PPR_FROM_ZPR_SLOT_PSEUDO and SPILL_PPR_TO_ZPR_SLOT_PSEUDO
/// operations within the MachineBasicBlock \p MBB. Returns true if any fill
/// expansion had to spill a predicate register (which creates new spill/fill
/// pseudos that themselves need expanding on a later pass).
static bool expandSMEPPRToZPRSpillPseudos(MachineBasicBlock &MBB,
                                          const TargetRegisterInfo &TRI,
                                          ScavengeableRegs const &SR,
                                          EmergencyStackSlots &SpillSlots) {
  // Walk the block backwards maintaining liveness, so the expansion helpers
  // can scavenge free registers instead of spilling whenever possible.
  LiveRegUnits UsedRegs(TRI);
  UsedRegs.addLiveOuts(MBB);
  bool HasPPRSpills = false;
  // early_inc_range: the pseudos are erased while iterating.
  for (MachineInstr &MI : make_early_inc_range(reverse(MBB))) {
    UsedRegs.stepBackward(MI);
    switch (MI.getOpcode()) {
    case AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO:
      HasPPRSpills |= expandFillPPRFromZPRSlotPseudo(MBB, MI, TRI, UsedRegs, SR,
                                                     SpillSlots);
      MI.eraseFromParent();
      break;
    case AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO:
      expandSpillPPRToZPRSlotPseudo(MBB, MI, TRI, UsedRegs, SR, SpillSlots);
      MI.eraseFromParent();
      break;
    default:
      break;
    }
  }

  return HasPPRSpills;
}
4424
+
4162
4425
void AArch64FrameLowering::processFunctionBeforeFrameFinalized (
4163
4426
MachineFunction &MF, RegScavenger *RS) const {
4427
+
4428
+ AArch64FunctionInfo *AFI = MF.getInfo <AArch64FunctionInfo>();
4429
+ const TargetSubtargetInfo &TSI = MF.getSubtarget ();
4430
+ const TargetRegisterInfo &TRI = *TSI.getRegisterInfo ();
4431
+
4432
+ // If predicate spills are 16 bytes we may need to expand
4433
+ // SPILL_PPR_TO_ZPR_SLOT_PSEUDO/FILL_PPR_FROM_ZPR_SLOT_PSEUDO.
4434
+ if (AFI->hasStackFrame () && TRI.getSpillSize (AArch64::PPRRegClass) == 16 ) {
4435
+ auto ComputeScavengeableRegisters = [&](unsigned RegClassID) {
4436
+ BitVector Regs = TRI.getAllocatableSet (MF, TRI.getRegClass (RegClassID));
4437
+ assert (Regs.count () > 0 && " Expected scavengeable registers" );
4438
+ return Regs;
4439
+ };
4440
+
4441
+ ScavengeableRegs SR{};
4442
+ SR.ZPRRegs = ComputeScavengeableRegisters (AArch64::ZPRRegClassID);
4443
+ // Only p0-7 are possible as the second operand of cmpne (needed for fills).
4444
+ SR.PPR3bRegs = ComputeScavengeableRegisters (AArch64::PPR_3bRegClassID);
4445
+ SR.GPRRegs = ComputeScavengeableRegisters (AArch64::GPR64RegClassID);
4446
+
4447
+ EmergencyStackSlots SpillSlots;
4448
+ for (MachineBasicBlock &MBB : MF) {
4449
+ // In the case we had to spill a predicate (in the range p0-p7) to reload
4450
+ // a predicate (>= p8), additional spill/fill pseudos will be created.
4451
+ // These need an additional expansion pass. Note: There will only be at
4452
+ // most two expansion passes, as spilling/filling a predicate in the range
4453
+ // p0-p7 never requires spilling another predicate.
4454
+ for (int Pass = 0 ; Pass < 2 ; Pass++) {
4455
+ bool HasPPRSpills =
4456
+ expandSMEPPRToZPRSpillPseudos (MBB, TRI, SR, SpillSlots);
4457
+ assert ((Pass == 0 || !HasPPRSpills) && " Did not expect PPR spills" );
4458
+ if (!HasPPRSpills)
4459
+ break ;
4460
+ }
4461
+ }
4462
+ }
4463
+
4164
4464
MachineFrameInfo &MFI = MF.getFrameInfo ();
4165
4465
4166
4466
assert (getStackGrowthDirection () == TargetFrameLowering::StackGrowsDown &&
@@ -4170,7 +4470,6 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
4170
4470
int64_t SVEStackSize =
4171
4471
assignSVEStackObjectOffsets (MFI, MinCSFrameIndex, MaxCSFrameIndex);
4172
4472
4173
- AArch64FunctionInfo *AFI = MF.getInfo <AArch64FunctionInfo>();
4174
4473
AFI->setStackSizeSVE (alignTo (SVEStackSize, 16U ));
4175
4474
AFI->setMinMaxSVECSFrameIndex (MinCSFrameIndex, MaxCSFrameIndex);
4176
4475
@@ -5204,9 +5503,13 @@ void AArch64FrameLowering::emitRemarks(
5204
5503
5205
5504
unsigned RegTy = StackAccess::AccessType::GPR;
5206
5505
if (MFI.getStackID (FrameIdx) == TargetStackID::ScalableVector) {
5207
- if (AArch64::PPRRegClass.contains (MI.getOperand (0 ).getReg ()))
5506
+ // SPILL_PPR_TO_ZPR_SLOT_PSEUDO and FILL_PPR_FROM_ZPR_SLOT_PSEUDO
5507
+ // spill/fill the predicate as a data vector (so are an FPR access).
5508
+ if (MI.getOpcode () != AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO &&
5509
+ MI.getOpcode () != AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO &&
5510
+ AArch64::PPRRegClass.contains (MI.getOperand (0 ).getReg ())) {
5208
5511
RegTy = StackAccess::PPR;
5209
- else
5512
+ } else
5210
5513
RegTy = StackAccess::FPR;
5211
5514
} else if (AArch64InstrInfo::isFpOrNEON (MI)) {
5212
5515
RegTy = StackAccess::FPR;
0 commit comments