-
Notifications
You must be signed in to change notification settings - Fork 4.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Handle > 64 registers + predicate registers for Arm64 #98258
Conversation
This is ready for review. Please go through it and let me know what you think. It seems a one-time cost of around ~10% regression is what we are getting for MinOpts and ~4% regression on FullOpts is what we are looking for. The regression is currently just on arm64 platform. The non-arm64 are mostly untouched. I have not yet updated the names of following typedefs and would like some suggestions. Here is my proposal:
|
@kunalspathak you should nominate some people specifically for review. Also seems like you ought to remove the "NO" labels on the PR. |
My two cents:
On a related note I think we shouldn't mix naming conventions for some of the new fields, like |
AllRegsMask_STOP_FOR_GC_TRASH = | ||
AllRegsMask((RBM_INT_CALLEE_TRASH & ~RBM_INTRET), (RBM_FLT_CALLEE_TRASH & ~RBM_FLOATRET), RBM_MSK_CALLEE_TRASH); | ||
AllRegsMask_PROFILER_ENTER_TRASH = AllRegsMask_CALLEE_TRASH; | ||
#endif // UNIX_AMD64_ABI | ||
|
||
AllRegsMask_PROFILER_LEAVE_TRASH = AllRegsMask_STOP_FOR_GC_TRASH; | ||
AllRegsMask_PROFILER_TAILCALL_TRASH = AllRegsMask_PROFILER_LEAVE_TRASH; | ||
|
||
// The registers trashed by the CORINFO_HELP_INIT_PINVOKE_FRAME helper. | ||
AllRegsMask_INIT_PINVOKE_FRAME_TRASH = AllRegsMask_CALLEE_TRASH; | ||
AllRegsMask_VALIDATE_INDIRECT_CALL_TRASH = GprRegsMask(RBM_VALIDATE_INDIRECT_CALL_TRASH); | ||
|
||
#elif defined(TARGET_ARM) | ||
|
||
AllRegsMask_CALLEE_TRASH_NOGC = GprRegsMask(RBM_CALLEE_TRASH_NOGC); | ||
AllRegsMask_PROFILER_ENTER_TRASH = AllRegsMask_NONE; | ||
|
||
// Registers killed by CORINFO_HELP_ASSIGN_REF and CORINFO_HELP_CHECKED_ASSIGN_REF. | ||
AllRegsMask_CALLEE_TRASH_WRITEBARRIER = GprRegsMask(RBM_R0 | RBM_R3 | RBM_LR | RBM_DEFAULT_HELPER_CALL_TARGET); | ||
|
||
// Registers no longer containing GC pointers after CORINFO_HELP_ASSIGN_REF and CORINFO_HELP_CHECKED_ASSIGN_REF. | ||
AllRegsMask_CALLEE_GCTRASH_WRITEBARRIER = AllRegsMask_CALLEE_TRASH_WRITEBARRIER; | ||
|
||
// Registers killed by CORINFO_HELP_ASSIGN_BYREF. | ||
AllRegsMask_CALLEE_TRASH_WRITEBARRIER_BYREF = | ||
GprRegsMask(RBM_WRITE_BARRIER_DST_BYREF | RBM_WRITE_BARRIER_SRC_BYREF | RBM_CALLEE_TRASH_NOGC); | ||
|
||
// Registers no longer containing GC pointers after CORINFO_HELP_ASSIGN_BYREF. | ||
// Note that r0 and r1 are still valid byref pointers after this helper call, despite their value being changed. | ||
AllRegsMask_CALLEE_GCTRASH_WRITEBARRIER_BYREF = AllRegsMask_CALLEE_TRASH_NOGC; | ||
AllRegsMask_PROFILER_RET_SCRATCH = GprRegsMask(RBM_R2); | ||
// While REG_PROFILER_RET_SCRATCH is not trashed by the method, the register allocator must | ||
// consider it killed by the return. | ||
AllRegsMask_PROFILER_LEAVE_TRASH = AllRegsMask_PROFILER_RET_SCRATCH; | ||
AllRegsMask_PROFILER_TAILCALL_TRASH = AllRegsMask_NONE; | ||
// The registers trashed by the CORINFO_HELP_STOP_FOR_GC helper (JIT_RareDisableHelper). | ||
// See vm\arm\amshelpers.asm for more details. | ||
AllRegsMask_STOP_FOR_GC_TRASH = | ||
AllRegsMask((RBM_INT_CALLEE_TRASH & ~(RBM_LNGRET | RBM_R7 | RBM_R8 | RBM_R11)), | ||
(RBM_FLT_CALLEE_TRASH & ~(RBM_DOUBLERET | RBM_F2 | RBM_F3 | RBM_F4 | RBM_F5 | RBM_F6 | RBM_F7))); | ||
// The registers trashed by the CORINFO_HELP_INIT_PINVOKE_FRAME helper. | ||
AllRegsMask_INIT_PINVOKE_FRAME_TRASH = | ||
(AllRegsMask_CALLEE_TRASH | GprRegsMask(RBM_PINVOKE_TCB | RBM_PINVOKE_SCRATCH)); | ||
|
||
AllRegsMask_VALIDATE_INDIRECT_CALL_TRASH = GprRegsMask(RBM_INT_CALLEE_TRASH); | ||
|
||
#elif defined(TARGET_ARM64) | ||
|
||
AllRegsMask_CALLEE_TRASH_NOGC = GprRegsMask(RBM_CALLEE_TRASH_NOGC); | ||
AllRegsMask_PROFILER_ENTER_TRASH = AllRegsMask((RBM_INT_CALLEE_TRASH & ~(RBM_ARG_REGS | RBM_ARG_RET_BUFF | RBM_FP)), | ||
(RBM_FLT_CALLEE_TRASH & ~RBM_FLTARG_REGS), RBM_MSK_CALLEE_TRASH); | ||
// Registers killed by CORINFO_HELP_ASSIGN_REF and CORINFO_HELP_CHECKED_ASSIGN_REF. | ||
AllRegsMask_CALLEE_TRASH_WRITEBARRIER = GprRegsMask(RBM_R14 | RBM_CALLEE_TRASH_NOGC); | ||
|
||
// Registers no longer containing GC pointers after CORINFO_HELP_ASSIGN_REF and CORINFO_HELP_CHECKED_ASSIGN_REF. | ||
AllRegsMask_CALLEE_GCTRASH_WRITEBARRIER = AllRegsMask_CALLEE_TRASH_NOGC; | ||
|
||
// Registers killed by CORINFO_HELP_ASSIGN_BYREF. | ||
AllRegsMask_CALLEE_TRASH_WRITEBARRIER_BYREF = | ||
GprRegsMask(RBM_WRITE_BARRIER_DST_BYREF | RBM_WRITE_BARRIER_SRC_BYREF | RBM_CALLEE_TRASH_NOGC); | ||
|
||
// Registers no longer containing GC pointers after CORINFO_HELP_ASSIGN_BYREF. | ||
// Note that x13 and x14 are still valid byref pointers after this helper call, despite their value being changed. | ||
AllRegsMask_CALLEE_GCTRASH_WRITEBARRIER_BYREF = AllRegsMask_CALLEE_TRASH_NOGC; | ||
|
||
AllRegsMask_PROFILER_LEAVE_TRASH = AllRegsMask_PROFILER_ENTER_TRASH; | ||
AllRegsMask_PROFILER_TAILCALL_TRASH = AllRegsMask_PROFILER_ENTER_TRASH; | ||
|
||
// The registers trashed by the CORINFO_HELP_STOP_FOR_GC helper | ||
AllRegsMask_STOP_FOR_GC_TRASH = AllRegsMask_CALLEE_TRASH; | ||
// The registers trashed by the CORINFO_HELP_INIT_PINVOKE_FRAME helper. | ||
AllRegsMask_INIT_PINVOKE_FRAME_TRASH = AllRegsMask_CALLEE_TRASH; | ||
AllRegsMask_VALIDATE_INDIRECT_CALL_TRASH = GprRegsMask(RBM_VALIDATE_INDIRECT_CALL_TRASH); | ||
#endif | ||
|
||
#if defined(TARGET_ARM) | ||
// profiler scratch remains gc live | ||
AllRegsMask_PROF_FNC_LEAVE = AllRegsMask_PROFILER_LEAVE_TRASH & ~AllRegsMask_PROFILER_RET_SCRATCH; | ||
#else | ||
AllRegsMask_PROF_FNC_LEAVE = AllRegsMask_PROFILER_LEAVE_TRASH; | ||
#endif // TARGET_ARM | ||
|
||
#ifdef TARGET_XARCH | ||
|
||
// Make sure we copy the register info and initialize the | ||
// trash regs after the underlying fields are initialized | ||
|
||
const regMaskTP vtCalleeTrashRegs[TYP_COUNT]{ | ||
#define DEF_TP(tn, nm, jitType, sz, sze, asze, st, al, regTyp, regFld, csr, ctr, tf) ctr, | ||
#include "typelist.h" | ||
#undef DEF_TP | ||
}; | ||
memcpy(varTypeCalleeTrashRegs, vtCalleeTrashRegs, sizeof(regMaskTP) * TYP_COUNT); | ||
|
||
if (codeGen != nullptr) | ||
{ | ||
codeGen->CopyRegisterInfo(); | ||
} | ||
#endif // TARGET_XARCH | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You may have said this earlier, but why can these not be static variables with values baked into the .dll?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Because compiler
object has ISA information that we use to determine the float/mask registers to include. #98258 (comment)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do all of these sets need dynamic creation? Isn't it just the ones defined here that do?
runtime/src/coreclr/jit/codegeninterface.h
Lines 62 to 91 in b7c3446
#if defined(TARGET_AMD64) | |
regMaskTP rbmAllFloat; | |
regMaskTP rbmFltCalleeTrash; | |
FORCEINLINE regMaskTP get_RBM_ALLFLOAT() const | |
{ | |
return this->rbmAllFloat; | |
} | |
FORCEINLINE regMaskTP get_RBM_FLT_CALLEE_TRASH() const | |
{ | |
return this->rbmFltCalleeTrash; | |
} | |
#endif // TARGET_AMD64 | |
#if defined(TARGET_XARCH) | |
regMaskTP rbmAllMask; | |
regMaskTP rbmMskCalleeTrash; | |
// Call this function after the equivalent fields in Compiler have been initialized. | |
void CopyRegisterInfo(); | |
FORCEINLINE regMaskTP get_RBM_ALLMASK() const | |
{ | |
return this->rbmAllMask; | |
} | |
FORCEINLINE regMaskTP get_RBM_MSK_CALLEE_TRASH() const | |
{ | |
return this->rbmMskCalleeTrash; | |
} | |
#endif // TARGET_XARCH |
Probably the name should
|
|
||
// Represents that the mask in this type is from one of the register type - gpr/float/predicate | ||
// but not more than 1. | ||
typedef unsigned __int64 regMaskOnlyOne; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it possible to have this type come with a tag that can ensure (in DEBUG only) that we don't try to do operations with mismatched register types? Currently I assume that would result in bogus results.
TP cost still seems awfully high... @jakobbotsch had an idea which might give us the freedom to explore how to control costs better, and also unblock dependent work, with (we suspect) very little or no downside. What if we restrict arm64 for the time being to only be able to allocate 24 FP registers? Then we can fit 32 GPR + 24 FP + 8 Mask into 64 bits, presumably with fairly minimal TP impact. Likely we have no cases where we really need more than 24 FP regs, so there won't be much CQ impact either. We will still need to solve the > 64 allocatable things problem but, we'll have time to work on it independently. |
I suspect it'll also be rare to require more than 8 mask registers. There are a large group of instructions (at least 200, see use of We might want to reserve two mask registers for all zero and all ones as that is very common usage. Due to the low predicate instructions, these would have to come from the low predicates registers. Maybe this is a good use for predicates Thinking wider, how many other registers are fixed? |
I expect we're going to end up paying more in terms of actual execution cost by trying to play funny tricks with bitpacking (i.e. using only 29 fp registers) than we would be just using the extra space. I expect there is going to be a short term higher cost to getting the support in here, no matter what route we take, and we're ultimately going to need the same work done for the x64 APX feature. While many functions may not need the full register set, there are many instructions which have special allocation requirements (like 4 sequential registers or having to start at a register divisible by I'd also like to call out again that it's very hard to gauge actual cost by TP numbers alone. SPMI doesn't factor in the overhead of the VM calls/token resolution, it doesn't really factor in the difference between debug vs release costs, it doesn't factor in that methods that are substantially slower may be infrequently compiled (excluding crossgen), it doesn't factor in that instructions can be pipelined, fused, or that some single instructions (division or multiplication) can have the cost of dozens of other instructions. We've seen this nuance in the actual perf numbers we track and triage weekly, such as https://pvscmdupload.blob.core.windows.net/reports/allTestHistory%2frefs%2fheads%2fmain_x64_Windows%2010.0.18362_RunKind%3dcrossgen_scenarios%2fCrossgen2%20Throughput%20-%20Single%20-%20System.Private.CoreLib.html, where the single biggest jump in that was the enabling of DPGO and the steady increase since is due to the organic growth of S.P.Corelib otherwise. The increase in time for changes like this (even being "10% throughput impact") are incredibly minimal in comparison to simply adding new APIs to the BCL or broadly enabling new optimizations. I think we should worry more about pushing this in the right direction and ensuring the code is understandable/maintainable, and then longer term get it optimized around what we land on from that perspective. |
Can't agree more on what @tannergooding says.
Yes, and we might introduce bugs in longer run and would have to add more workarounds to deal with them.
Yes, this is precisely my thinking that I expressed somewhere above. The entire code base had an assumption that we will not surpass more than 64 registers and now, we do. So, we need to include that functionality in various places. It is similar to how arm32 has higher TP cost just because of special handling that is needed to handle even-odd pair of registers, that is not present in other platforms.
And there were several routes (4~5 prototypes) that were explored in last couple of months just in pursuit of bringing the TP numbers (reported by superpmi down), after which I had to settle down on current solution, which is simpler and more importantly maintainable.
Back when I added consecutive registers support in #80297, I had to disable some special stress register modes just because that wouldn't satisfy the register requirements for the method, given that they needed consecutive registers at multiple places within the same method.
For sure. I have a work item in mind to reduce the size of float/vector registers field from 8 bytes to 4 bytes. With that, all the register masks will be reduced to 4 bytes, which will reduce the size of common data structures like
The numbers I collected in #99658 (comment) proves that there was no impact seen on crossgen2 throughput. Even my previous TP improvements done in #96386, #85144, #87424 and #85842 that combined improved TP numbers by around 15%, none of that showed up in https://pvscmdupload.blob.core.windows.net/reports/allTestHistory%2frefs%2fheads%2fmain_x64_Windows%2010.0.18362_RunKind%3dcrossgen_scenarios%2fCrossgen2%20Throughput%20-%20Single%20-%20System.Private.CoreLib.html.
Yes, I would like to get more feedback on "understandable/maintainable" and "code readability" part.
@AndyAyersMS - I assume you are talking about having these restrictions on methods that need mask registers, but continue to have 32 fp registers otherwise. We will still not know about it until we get past importer, but by then, we populate most of the register masks like callee-save and callee-trash masks, etc. and will need to reset. |
The idea would be to just treat some FP registers to not exist universally, so the change would be simple. From my side it was merely a suggestion on how to unblock work if we were unhappy about taking the TP regressions. The wall clock measurements above clearly show MinOpts impact in clrjit.dll (what it translates to on actual startup scenarios is another question).
IMO the current solution seems complex and less maintainable. It adds multiple thousand lines of code and makes it possible to silently get register set operations wrong (like union between two It's still surprising to me that just bumping
I agree it would be great to have these optimizations.
Does this measure a significant number of MinOpts compilations? I assume crossgen2 is compiling everything in FullOpts, and we know that crossgen2 itself interacts poorly with tiered compilation (#83112), so it might just be dominated by other costs than jitting. |
I tried your PR at #96196 and see the following for benchmarks.run_pgo. This is arm64 cross compiled from x64 where I use the intrinsic for Base: 141063333131, Diff: 146798692822, +4.0658%
1339309854 : +33.69% : 20.61% : +0.9494% : public: unsigned __int64 __cdecl LinearScan::RegisterSelection::select<0>(class Interval *, class RefPosition *)
790662416 : +12966.72% : 12.16% : +0.5605% : public: void __cdecl Interval::mergeRegisterPreferences(unsigned __int64)
668963185 : +22.84% : 10.29% : +0.4742% : public: void __cdecl LinearScan::allocateRegisters<0>(void)
534637331 : +26.96% : 8.23% : +0.3790% : private: void __cdecl LinearScan::processBlockStartLocations(struct BasicBlock *)
358948739 : +31.97% : 5.52% : +0.2545% : public: void __cdecl LinearScan::allocateRegistersMinimal(void)
318294318 : +25.55% : 4.90% : +0.2256% : private: class RefPosition * __cdecl LinearScan::newRefPosition(class Interval *, unsigned int, enum RefType, struct GenTree *, unsigned __int64, unsigned int)
256068471 : +39.17% : 3.94% : +0.1815% : protected: enum _regNumber_enum __cdecl CodeGen::genConsumeReg(struct GenTree *)
230314932 : +21.92% : 3.54% : +0.1633% : private: void __cdecl LinearScan::associateRefPosWithInterval(class RefPosition *)
153840898 : +34.77% : 2.37% : +0.1091% : private: void __cdecl LinearScan::addRefsForPhysRegMask(unsigned __int64, unsigned int, enum RefType, bool)
153099855 : +40.08% : 2.36% : +0.1085% : private: void __cdecl LinearScan::freeRegisters(unsigned __int64)
124799070 : +122.19% : 1.92% : +0.0885% : public: void __cdecl GCInfo::gcMarkRegPtrVal(enum _regNumber_enum, enum var_types)
104909768 : +10.68% : 1.61% : +0.0744% : protected: void __cdecl CodeGen::genCodeForBBlist(void)
85800424 : NA : 1.32% : +0.0608% : public: void __cdecl emitter::emitUpdateLiveGCregs(enum GCtype, unsigned __int64, unsigned char *)
72840134 : +8.71% : 1.12% : +0.0516% : private: int __cdecl LinearScan::BuildNode(struct GenTree *) So the vast majority of the TP impact is coming from a small number of functions within LSRA. I wonder if it would worth it to try to restrict the introduction of the segregated register sets to LSRA only such that the rest of the JIT doesn't need to learn about the differences. |
Startup impactI did some measurements on TE benchmarks and measured startup and first request time. I barely see ~1.5% regression.
10 iterations dataFortunes
Json-Minimal
Json Mvc
Orchard
Crossgen2 throughout impactCrossgen2 Throughput data: #99658 (comment) TP impactNow, let's take a look at TP impact reported by superpmi. Looking at the TP difference for Overall (+4.25%)
MinOpts (+6.47%)
FullOpts (+2.57%)
?allocateRegistersMinimal@LinearScan@@QEAAXXZ : 340392599 : +45.58% : 25.24% : +2.9085%
?addRefsForPhysRegMask@LinearScan@@AEAAXAEBU_regMaskAll@@IW4RefType@@_N@Z : 184181130 : NA : 13.66% : +1.5737%
?freeRegisters@LinearScan@@AEAAXU_regMaskAll@@@Z : 86201120 : NA : 6.39% : +0.7365%
?allocateRegMinimal@LinearScan@@AEAA?AW4_regNumber_enum@@PEAVInterval@@PEAVRefPosition@@@Z : 81623577 : +13.48% : 6.05% : +0.6974%
?freeRegister@LinearScan@@AEAAXPEAVRegRecord@@@Z : 70217892 : NA : 5.21% : +0.6000%
?gtGetGprRegMask@GenTree@@QEBA_KXZ : 45216669 : NA : 3.35% : +0.3863%
?writeRegisters@LinearScan@@QEAAXPEAVRefPosition@@PEAUGenTree@@@Z : 30848120 : NA : 2.29% : +0.2636%
?updateAssignedInterval@LinearScan@@AEAAXPEAVRegRecord@@PEAVInterval@@@Z : 28280466 : +40.08% : 2.10% : +0.2416%
?newRefPosition@LinearScan@@AEAAPEAVRefPosition@@PEAVInterval@@IW4RefType@@PEAUGenTree@@_KI@Z : 23259358 : +8.59% : 1.72% : +0.1987%
?unassignPhysReg@LinearScan@@AEAAXPEAVRegRecord@@PEAVRefPosition@@@Z : 17337530 : +21.69% : 1.29% : +0.1481%
?assignPhysReg@LinearScan@@AEAAXPEAVRegRecord@@PEAVInterval@@@Z : 16785216 : +27.27% : 1.24% : +0.1434%
?buildKillPositionsForNode@LinearScan@@AEAA_NPEAUGenTree@@IAEBU_regMaskAll@@@Z : 11895844 : NA : 0.88% : +0.1016%
??0LinearScan@@QEAA@PEAVCompiler@@@Z : 11645946 : +50.81% : 0.86% : +0.0995%
?PopCount@BitOperations@@SAI_K@Z : 10508274 : +134.96% : 0.78% : +0.0898%
?gcMarkRegPtrVal@GCInfo@@QEAAXW4_regNumber_enum@@W4var_types@@@Z : 9900926 : +40.80% : 0.73% : +0.0846%
?genConsumeReg@CodeGen@@IEAA?AW4_regNumber_enum@@PEAUGenTree@@@Z : 8915253 : +7.52% : 0.66% : +0.0762%
?BuildNode@LinearScan@@AEAAHPEAUGenTree@@@Z : 8813598 : +5.66% : 0.65% : +0.0753%
?newRefPositionRaw@LinearScan@@AEAAPEAVRefPosition@@IPEAUGenTree@@W4RefType@@@Z : 7806564 : +1.89% : 0.58% : +0.0667%
?buildPhysRegRecords@LinearScan@@AEAAXXZ : 7529067 : +16.06% : 0.56% : +0.0643%
?BuildCall@LinearScan@@AEAAHPEAUGenTreeCall@@@Z : 5614378 : +14.60% : 0.42% : +0.0480%
?genCodeForTreeNode@CodeGen@@IEAAXPEAUGenTree@@@Z : 5226233 : +3.53% : 0.39% : +0.0447%
?ins_Copy@CodeGen@@QEAA?AW4instruction@@W4_regNumber_enum@@W4var_types@@@Z : 4644888 : NA : 0.34% : +0.0397%
?genProduceReg@CodeGen@@IEAAXPEAUGenTree@@@Z : 3660030 : +3.24% : 0.27% : +0.0313%
?allocateMemory@ArenaAllocator@@QEAAPEAX_K@Z : 3084940 : +0.89% : 0.23% : +0.0264%
?genSetRegToConst@CodeGen@@IEAAXW4_regNumber_enum@@W4var_types@@PEAUGenTree@@@Z : 3041553 : +30.06% : 0.23% : +0.0260%
?associateRefPosWithInterval@LinearScan@@AEAAXPEAVRefPosition@@@Z : 2856060 : +1.26% : 0.21% : +0.0244%
?instGen_Set_Reg_To_Imm@CodeGen@@QEAAXW4emitAttr@@W4_regNumber_enum@@_JW4insFlags@@@Z : 2522843 : +6.25% : 0.19% : +0.0216%
?genRestoreCalleeSavedRegistersHelp@CodeGen@@IEAAXAEBU_regMaskAll@@HH@Z : 2100312 : NA : 0.16% : +0.0179%
?emitOutputInstr@emitter@@IEAA_KPEAUinsGroup@@PEAUinstrDesc@1@PEAPEAE@Z : 1693293 : +0.44% : 0.13% : +0.0145%
?genSaveCalleeSavedRegistersHelp@CodeGen@@IEAAXAEBU_regMaskAll@@HH@Z : 1689330 : NA : 0.13% : +0.0144%
?compCompileHelper@Compiler@@QEAAHPEAUCORINFO_MODULE_STRUCT_@@PEAVICorJitInfo@@PEAUCORINFO_METHOD_INFO@@PEAPEAXPEAIPEAVJitFlags@@@Z : 1594827 : +17.09% : 0.12% : +0.0136%
?HasMultiRegRetVal@GenTreeCall@@QEBA_NXZ : -1430094 : -12.99% : 0.11% : -0.0122%
?BuildDefsWithKills@LinearScan@@AEAAXPEAUGenTree@@H_K1@Z : -1555166 : -100.00% : 0.12% : -0.0133%
?inst_Mov@CodeGen@@QEAAXW4var_types@@W4_regNumber_enum@@1_NW4emitAttr@@W4insFlags@@@Z : -1927004 : -13.42% : 0.14% : -0.0165%
?UpdateLifeVar@?$TreeLifeUpdater@$00@@AEAAXPEAUGenTree@@PEAUGenTreeLclVarCommon@@@Z : -2246490 : -8.57% : 0.17% : -0.0192%
?resetAllRegistersState@LinearScan@@AEAAXXZ : -3366369 : -6.09% : 0.25% : -0.0288%
?BuildUse@LinearScan@@AEAAPEAVRefPosition@@PEAUGenTree@@_KH@Z : -5405424 : -3.81% : 0.40% : -0.0462%
?updateMaxSpill@LinearScan@@QEAAXPEAVRefPosition@@@Z : -7422214 : -9.99% : 0.55% : -0.0634%
??$resolveRegisters@$0A@@LinearScan@@QEAAXXZ : -10955233 : -4.12% : 0.81% : -0.0936%
?buildKillPositionsForNode@LinearScan@@AEAA_NPEAUGenTree@@I_K@Z : -11299476 : -100.00% : 0.84% : -0.0965%
?BuildDefs@LinearScan@@AEAAXPEAUGenTree@@H_K@Z : -12740041 : -100.00% : 0.94% : -0.1089%
?gtGetRegMask@GenTree@@QEBA_KXZ : -30758149 : -100.00% : 2.28% : -0.2628%
?freeRegisters@LinearScan@@AEAAX_K@Z : -94590308 : -100.00% : 7.02% : -0.8082%
?addRefsForPhysRegMask@LinearScan@@AEAAX_KIW4RefType@@_N@Z : -104673921 : -100.00% : 7.76% : -0.8944%
Most of the regression is coming from |
Added a |
Review guide
Predicate Registers
registerarm64.h
adds the 16 new predicate registers numbered64
thru79
. Their masks are from0x0
thru0x8000
. Following files adds the predicate registers. On arm64, now we need 7 bits to represent the register number and hence theREGNUM_BITS
has changed from 6 -> 7 bits (targetarm64.h
).AllRegsMask
struct
is introduced to represent the register mask. IfHAS_MORE_THAN_64_REGISTERS
is not defined (for all non-arm64 platforms), this contains a single 64-bit field. But for arm64, (HAS_MORE_THAN_64_REGISTERS
is defined), the struct contains an extra field of 4-bytes to represent the predicate registers. The definition of this struct is present intarget.h
and is very similar to how I mentioned it here.Few things that are worth explaining for this struct:
HAS_MORE_THAN_64_REGISTERS
is not defined, the struct will continue to operate on a 64-bits field, thus not impacting the TP of these platforms.gpr
,float
andpredicate
registers, in that order. The way they are defined underunion
is so that theAllRegsMask
can access relevant fields without any branches or conditions. For e.g. If a float registerd2
has to be added in the mask, I did not want to have something like:Instead, a mapping is added in all
register*.h
files to map allgpr registers -> 0
,float registers -> 1
andpredicate registers -> 2
. Having that, I could rewrite the above code as:Using predicate registers from the mask is uncommon and mostly, the consumers of
AllRegsMask
are interested ingpr
/float
registers. Hence a_combinedRegisters
field is added in theunion
to have easy access of them.This design also provide an easy way to retrieve just
gprRegs()
orfloatRegs()
orpredicateRegs()
easily.Until now, the manipulation of registers (adding/removing registers) from the mask was trivial and was done using bit manipulation. However, with
AllRegsMask
struct, it offers new methods to do such manipulation.regNumber
as input already know which field of theAllRegsMask
struct they are operating. If the method takes a set of registers, those methods also need to know the register class of that mask, to determine which field of theAllRegsMask
it needs to update. The definition of these methods are present incompiler.hpp
.encodeForIndex()
anddecodeForIndex()
. Imagine adding mask of a register (1 << regNumber
) in the relevant field, here is how we could write:Alternatively, since both
gpr
andpredicate
register mask starts with bit 00x0
and hence can be directly added to_registers[0]
or_registers[2]
, we could rewrite it as following:Either way, we have to do a branch to add the mask to the relevant field. To make this code branch-free, for float register mask, I right shift it by 32 to fit it in 4-bytes using
encodeForIndex()
and while returning back the float register, usedecodeForIndex()
by left shifting it by 32-bits. With that, we can just do something like this:In LSRA, until now, we would use a single 64-bits primitive to represent register set. Whenever we want to extract each register corresponding to the bit ON in the set, we would iterate through it and return the next ON bit and toggle it. Following new methods are added to handle that aspect:
genFirstRegNumFromMaskAndToggle()
: WithAllRegsMask
, once we run out ofgpr/float
field, we need to iterate over the_predicateRegs
field.genRegNumFromMask()
: This method will now also take thetype
that it expects the register to extract and accordingly add64
to the extracted ON bit from the mask.genFirstRegNumFromMask()
: This too first scans thegpr/float
fields to see if anything is set and if not, will look into_predicateRegs
field.Lsra
regMaspTP
toregMaskOnlyOne
addRefsForPhysRegMask
) that takes mask containing registers of different register class (gpr/float/predicate) are now takingAllRegsMask
. They either pass through the mask to other methods, or iterate over all the registers present in the mask (and toggle them) using the newly addedgenFirstRegNumFromMaskAndToggle()
.AllRegsMask
instead to save the killMaskregMaskOnlyOne
as parameter, an extra parameter oftype
is passed to know what register class they represent, for further support updating the right fields ofAllRegsMask
.BuildDef*
methods: Added few new methods to group some of the logic around building definitions for callsBuildCallDefs()
/BuildCallDefsWithKills()
, the ones that just buildRefPosition
for killsBuildKills()
. Most of them takesAllRegsMask
as thekillMask
.RegisterType
field is removed fromRegRecord
andInterval
and moved it inside the parent classReferanceable
. This was done, so it is easy to query the type to determine the register class for a givenRefPosition
. Without this, we would have to check first if theRefPosition
represents virtual register (Interval
) or a physical register (RegRecord
).regMaskTP
toregMaskGpr
orregMaskFloat
.freeRegisters()
,verifyFreeRegisters()
,updateDeadCandidatesAtBlockStart()
,inActivateRegisters()
has been added. The original method will continue operating of the 64-bit mask (regMaskTP
), and the overloaded method operates onAllRegsMask
. The only difference between the two is how the bits are iterated and toggled inside the method.m_AvailableRegs
,placedArgRegs
,registersToDump
,m_RegistersWithConstants
,fixedRegs
,regsBusyUntilKill
,regsInUseThisLocation
,regsInUseNextLocation
is changed fromregMaskTP
toAllRegsMask
. The relevant methods that read/write these fields are updated to take theregisterType
as parameter. Based on that, it will add/remove the given register mask from theAllRegsMask
.m_AvailableRegs
, etc. now have to use the methods fromAllRegsMask
to add/remove/update the register/register mask. For that, we need to pass theregisterType
to those methods.regsToFree
,delayRegsToFree
,regsToMakeInactive
,delayRegsToMakeInactive
,copyRegsToFree
,targetRegsToDo
,targetRegsReady
,targetRegsFromStack
, etc. that tracks the registers during allocation/resolution are now changed toAllRegsMask
and so is the way they manipulate the add/removing of registers. They now use methods fromAllRegsMask
and sometimes need to passregisterType
to know the register class of themask
that is being added/removed.regMaskTP
type.killMask
.Codegen
regSet
using new methods onregSet
, based on the type.AllRegsMask
instead to save the killMaskgen(Save|Restore)CalleeSavedRegisterGroup
renamed the type of register mask fromregMaskTP
toregMaskOnlyOne
and it takes thetype
as additional parameter so it can pass along to other methods likegenBuildRegPairsStack
, etc.gen(Save|Restore)CalleeSavedRegistersHelp()
now takesAllRegsMask
instead ofregMaskTP
, because it has to also save/restore predicate registers. This method pass the individual register class mask togen(Save|Restore)CalleeSavedRegisterGroup
. Callers of this method basically extract theAllRegsMask
from theregSet
field to send all the callee saved registers. I might simplofy some of this to erase some of the changes.Compiler
RBM_*
masks that are today defined in varioustarget*.h
files, we need to have correspondingAllRegsMask_*
equivalent. Most of them rely on the availability offloat
registers and for AVX512, they are not known until we initialize the compiler object with CPU features. Hence these fields are defined after such initialization so they contain the accurate active register set, specially the float registers, needed for the compilation of the method.Misc
Most of the other changes mentioned below are minimal and are needed because the
regMaskTP
is renamed to one ofregMaskGpr
,regMaskFloat
, etc.Following files just has changes to the type names
AllRegsMask
inFuncletInfo
)AllRegsMask
)ID_EXTRA_BITFIELD_BITS
increased from 21 to 23 bits because we have twoREGNUM_BITS
ininstrDesc
)Following files add new parameter to categorize the register class
Added
REG_FP_COUNT
,REG_MASK_COUNT
andRBM_ALLGPR
We have
RegSet
class that tracks the registers touched and is used during codegen, return unused register or spill registers. To track all the different type of registers, I convertedrsModifiedRegsMask
fromregMaskTP
toAllRegsMask
. All the other methods that were changed was saving a particular register or set of registers (depending on the type) torsModifiedRegsMask
and returning back the register set for given type.Some of the methods in
GenTree*
now need to returnAllRegsMask
because the ABI might require it (I am certain that this can be justfloatgpr
, but had it for now):Refactoring
Handling of mask registers
Old TODO
This is just a prototype to see if the asserts added are hit or not.
TODO:
Make the size of. This will happen in follow-up PR.regMask*
that this PR introduces to 4 bytes instead of 8 bytesReport the memory savings from changing 8 bytes -> 4 bytesNAConvert all theRBM_*
to represent 4 bytes i.e.VMASK
will change to be no-opRBM_ALLFLOAT
/RBM_ALLDOUBLE
might not be relevant anymore. So will need to remove/tweak code that relies on it.AllRegsMask()
depending on if method has just int, int/float, int/float/predicateAllRegsMask()
can be merged without TP impact. Edit: It should go all together.AllRegsMask()
to array would improve? We can have a prepopulated map of register toindex
ofAllRegMask()
the register should touch if it needs to be added/removed/trackedAllRegsMask
that contains a single64-bit
mask field and all the operations on it would be done either on low 32-bits (gpr) or high 32-bits. There can be a structAllRegsMaskWithPredicate
that inherits fromAllRegsMask
and just has a 32-bits field for predicate registers. Places that mostly deal with gpr/float can useAllRegsMask
. Edit: We changed the design to useAllRegsMask
instead.checked
performance is not too much impacted. Most likely when we convert theregMaskGpr
, etc. to 4 bytes, all of the asserts will be gone.HAS_PREDICATE_REGS
withFEATURE_MASKED_HW_INTRINSICS
Fixes: #99658