-
Notifications
You must be signed in to change notification settings - Fork 5k
Abolish PSPSym from ABI #114630
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Abolish PSPSym from ABI #114630
Conversation
Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch |
This comment was marked as outdated.
This comment was marked as outdated.
This comment was marked as outdated.
This comment was marked as outdated.
This comment was marked as outdated.
This comment was marked as outdated.
Diffs look favorable: ARM64 has places where it stopped using vector registers for memory copying. Not sure if we can tweak the alignment to fix this but it seems plausible. /cc @jkotas |
What's an example of a method with this diff? |
211353.dasm - System.Collections.Immutable.ImmutableHashSet`1[System.__Canon]:System.Collections.Generic.ICollection.CopyTo(System.__Canon[],int):this (FullOpts) @@ -9,10 +9,10 @@
; 0 inlinees with PGO data; 3 single block inlinees; 4 inlinees without PGO data
; Final local variable assignments
;
-; V00 this [V00,T13] ( 6, 5 ) ref -> [fp+0x18] this class-hnd EH-live single-def <System.Collections.Immutable.ImmutableHashSet`1[System.__Canon]>
+; V00 this [V00,T13] ( 6, 5 ) ref -> [fp+0x10] this class-hnd EH-live single-def <System.Collections.Immutable.ImmutableHashSet`1[System.__Canon]>
; V01 arg1 [V01,T07] ( 5, 8 ) ref -> x20 class-hnd single-def <System.__Canon[]>
; V02 arg2 [V02,T05] ( 6, 12 ) int -> registers
-; V03 loc0 [V03 ] ( 11, 26 ) struct (128) [fp+0xE0] do-not-enreg[XSF] must-init addr-exposed ld-addr-op <System.Collections.Immutable.ImmutableHashSet`1+Enumerator[System.__Canon]>
+; V03 loc0 [V03 ] ( 11, 26 ) struct (128) [fp+0xD8] do-not-enreg[XSF] must-init addr-exposed ld-addr-op <System.Collections.Immutable.ImmutableHashSet`1+Enumerator[System.__Canon]>
; V04 loc1 [V04,T09] ( 2, 8 ) ref -> x2 class-hnd <System.__Canon>
;# V05 OutArgs [V05 ] ( 1, 1 ) struct ( 0) [sp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace" <Empty>
;* V06 tmp1 [V06 ] ( 0, 0 ) int -> zero-ref "impAppendStmt"
@@ -20,9 +20,9 @@
; V08 tmp3 [V08,T17] ( 4, 4 ) long -> x1 "Indirect call through function pointer"
;* V09 tmp4 [V09 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
;* V10 tmp5 [V10 ] ( 0, 0 ) ubyte -> zero-ref "Inlining Arg"
-; V11 tmp6 [V11,T15] ( 3, 6 ) struct (128) [fp+0x60] do-not-enreg[SF] must-init ld-addr-op "NewObj constructor temp" <System.Collections.Immutable.ImmutableHashSet`1+Enumerator[System.__Canon]>
+; V11 tmp6 [V11,T15] ( 3, 6 ) struct (128) [fp+0x58] do-not-enreg[SF] must-init ld-addr-op "NewObj constructor temp" <System.Collections.Immutable.ImmutableHashSet`1+Enumerator[System.__Canon]>
; V12 tmp7 [V12,T18] ( 2, 4 ) ref -> x22 class-hnd exact single-def "Inlining Arg" <System.Collections.Immutable.SortedInt32KeyNode`1[System.Collections.Immutable.ImmutableHashSet`1+HashBucket[System.__Canon]]>
-; V13 tmp8 [V13 ] ( 3, 6 ) struct (32) [fp+0x40] do-not-enreg[XS] must-init addr-exposed ld-addr-op "NewObj constructor temp" <System.Collections.Immutable.SortedInt32KeyNode`1+Enumerator[System.Collections.Immutable.ImmutableHashSet`1+HashBucket[System.__Canon]]>
+; V13 tmp8 [V13 ] ( 3, 6 ) struct (32) [fp+0x38] do-not-enreg[XS] must-init addr-exposed ld-addr-op "NewObj constructor temp" <System.Collections.Immutable.SortedInt32KeyNode`1+Enumerator[System.Collections.Immutable.ImmutableHashSet`1+HashBucket[System.__Canon]]>
; V14 tmp9 [V14,T19] ( 2, 4 ) long -> x0 "Inlining Arg"
;* V15 tmp10 [V15 ] ( 0, 0 ) byref -> zero-ref
;* V16 tmp11 [V16 ] ( 0, 0 ) byref -> zero-ref
@@ -36,16 +36,15 @@
; V24 tmp19 [V24,T00] ( 2, 32 ) long -> x1 "argument with side effect"
; V25 tmp20 [V25,T11] ( 2, 8 ) long -> x1 "argument with side effect"
; V26 tmp21 [V26,T10] ( 2, 8 ) ref -> x26 "argument with side effect"
-; V27 tmp22 [V27 ] ( 2, 8 ) struct (32) [fp+0x20] do-not-enreg[XS] must-init addr-exposed "by-value struct argument" <System.Collections.Immutable.SortedInt32KeyNode`1+Enumerator[System.Collections.Immutable.ImmutableHashSet`1+HashBucket[System.__Canon]]>
+; V27 tmp22 [V27 ] ( 2, 8 ) struct (32) [fp+0x18] do-not-enreg[XS] must-init addr-exposed "by-value struct argument" <System.Collections.Immutable.SortedInt32KeyNode`1+Enumerator[System.Collections.Immutable.ImmutableHashSet`1+HashBucket[System.__Canon]]>
; V28 tmp23 [V28,T12] ( 2, 8 ) long -> x0 "argument with side effect"
; V29 tmp24 [V29,T04] ( 2, 16 ) long -> x1 "argument with side effect"
; V30 cse0 [V30,T06] ( 3, 13 ) long -> x22 hoist "CSE #03: aggressive"
; V31 cse1 [V31,T16] ( 7, 5 ) long -> x21 multi-def "CSE #01: moderate"
; V32 cse2 [V32,T14] ( 3, 6 ) ref -> x26 "CSE #05: aggressive"
; V33 cse3 [V33,T08] ( 2, 9 ) long -> x23 hoist "CSE #04: aggressive"
-; V34 rat0 [V34 ] ( 1, 1 ) long -> [fp+0x168] do-not-enreg[V] "PSPSym"
;
-; Lcl frame size = 352
+; Lcl frame size = 336
G_M35275_IG01: ; bbWeight=1, gcVars=0000000000000000 {}, gcrefRegs=0000 {}, byrefRegs=0000 {}, gcvars, byref, nogc <-- Prolog IG
stp fp, lr, [sp, #0xD1FFAB1E]!
@@ -55,20 +54,19 @@ G_M35275_IG01: ; bbWeight=1, gcVars=0000000000000000 {}, gcrefRegs=0000 {
stp x25, x26, [sp, #0xD1FFAB1E]
mov fp, sp
movi v16.16b, #0
- mov x9, fp
+ sub x9, fp, #8
mov x10, #0xD1FFAB1E
stp q16, q16, [x9, #0x20]
stp q16, q16, [x9, #0x40]!
subs x10, x10, #64
bge pc-16 (-4 instructions)
- add x3, sp, #0xD1FFAB1E
- stp x0, x3, [fp, #0xD1FFAB1E] // [V34 rat0]
- str x0, [fp, #0x18] // [V00 this]
+ str x0, [fp, #0xD1FFAB1E]
+ str x0, [fp, #0x10] // [V00 this]
; GC ptr vars +{V00}
mov x20, x1
; gcrRegs +[x20]
mov w19, w2
- ;; size=72 bbWeight=1 PerfScore 14.00
+ ;; size=68 bbWeight=1 PerfScore 13.50
G_M35275_IG02: ; bbWeight=1, gcVars=0000000000002000 {V00}, gcrefRegs=100001 {x0 x20}, byrefRegs=0000 {}, gcvars, byref, isz
; gcrRegs +[x0]
ldr x21, [x0]
@@ -94,7 +92,7 @@ G_M35275_IG02: ; bbWeight=1, gcVars=0000000000002000 {V00}, gcrefRegs=100
; gcr arg pop 0
tbnz w19, #31, G_M35275_IG17
ldr w0, [x20, #0x08]
- ldr x1, [fp, #0x18] // [V00 this]
+ ldr x1, [fp, #0x10] // [V00 this]
; gcrRegs +[x1]
ldr w11, [x1, #0x20]
add w11, w19, w11
@@ -107,12 +105,12 @@ G_M35275_IG02: ; bbWeight=1, gcVars=0000000000002000 {V00}, gcrefRegs=100
blr xip0
; gcrRegs -[x1]
; gcr arg pop 0
- ldr x1, [fp, #0x18] // [V00 this]
+ ldr x1, [fp, #0x10] // [V00 this]
; gcrRegs +[x1]
ldr x22, [x1, #0x10]
; gcrRegs +[x22]
- stp xzr, xzr, [fp, #0x40]
- stp xzr, xzr, [fp, #0x50]
+ stp xzr, xzr, [fp, #0x38]
+ stp xzr, xzr, [fp, #0x48]
adrp x11, [HIGH RELOC #0xD1FFAB1E] // function address
add x11, x11, [LOW RELOC #0xD1FFAB1E]
ldr xip0, [x11]
@@ -120,7 +118,7 @@ G_M35275_IG02: ; bbWeight=1, gcVars=0000000000002000 {V00}, gcrefRegs=100
; gcrRegs -[x1]
; gcr arg pop 0
mov x1, x0
- add x0, fp, #64 // [V13 tmp8]
+ add x0, fp, #56 // [V13 tmp8]
mov x2, x22
; gcrRegs +[x2]
adrp x11, [HIGH RELOC #0xD1FFAB1E] // function address
@@ -131,29 +129,36 @@ G_M35275_IG02: ; bbWeight=1, gcVars=0000000000002000 {V00}, gcrefRegs=100
; gcr arg pop 0
;; size=168 bbWeight=1 PerfScore 56.00
G_M35275_IG03: ; bbWeight=1, nogc, extend
- ldp q16, q17, [fp, #0x40]
- stp q16, q17, [fp, #0x70]
- ;; size=8 bbWeight=1 PerfScore 3.00
+ ldp x0, x1, [fp, #0x38]
+ stp x0, x1, [fp, #0x68]
+ ldp x0, x1, [fp, #0x48]
+ stp x0, x1, [fp, #0x78]
+ ;; size=16 bbWeight=1 PerfScore 8.00
G_M35275_IG04: ; bbWeight=1, extend
- movi v16.16b, #0
- stp q16, q16, [fp, #0x90]
- stp q16, q16, [fp, #0xB0]
- str q16, [fp, #0xD0]
- ;; size=16 bbWeight=1 PerfScore 3.50
+ stp xzr, xzr, [fp, #0x88]
+ stp xzr, xzr, [fp, #0x98]
+ stp xzr, xzr, [fp, #0xA8]
+ stp xzr, xzr, [fp, #0xB8]
+ stp xzr, xzr, [fp, #0xC8]
+ ;; size=20 bbWeight=1 PerfScore 5.00
G_M35275_IG05: ; bbWeight=1, nogc, extend
+ ldr x0, [fp, #0x58]
+ str x0, [fp, #0xD8]
ldp q16, q17, [fp, #0x60]
stp q16, q17, [fp, #0xE0]
ldp q16, q17, [fp, #0x80]
stp q16, q17, [fp, #0xD1FFAB1E]
ldp q16, q17, [fp, #0xA0]
stp q16, q17, [fp, #0xD1FFAB1E]
- ldp q16, q17, [fp, #0xC0]
- stp q16, q17, [fp, #0xD1FFAB1E]
- ;; size=32 bbWeight=1 PerfScore 12.00
+ ldr q16, [fp, #0xC0]
+ str q16, [fp, #0xD1FFAB1E]
+ ldr x0, [fp, #0xD0]
+ str x0, [fp, #0xD1FFAB1E]
+ ;; size=48 bbWeight=1 PerfScore 18.00
G_M35275_IG06: ; bbWeight=1, extend
- str xzr, [fp, #0xE0] // [V03 loc0]
+ str xzr, [fp, #0xD8] // [V03 loc0]
movn w0, #0
- str w0, [fp, #0xE8] // [V03 loc0+0x08]
+ str w0, [fp, #0xE0] // [V03 loc0+0x08]
;; size=12 bbWeight=1 PerfScore 2.50
G_M35275_IG07: ; bbWeight=1, gcrefRegs=100000 {x20}, byrefRegs=0000 {}, byref
mov x0, x21
@@ -176,13 +181,13 @@ G_M35275_IG08: ; bbWeight=4, gcrefRegs=100000 {x20}, byrefRegs=0000 {}, b
blr xip0
; gcr arg pop 0
mov x25, x0
- ldr x0, [fp, #0xF0] // [V03 loc0+0x10]
+ ldr x0, [fp, #0xE8] // [V03 loc0+0x10]
; gcrRegs +[x0]
cbz x0, G_M35275_IG10
;; size=36 bbWeight=4 PerfScore 38.00
G_M35275_IG09: ; bbWeight=2, gcrefRegs=100000 {x20}, byrefRegs=0000 {}, byref, isz
; gcrRegs -[x0]
- ldr x26, [fp, #0xF8] // [V03 loc0+0x18]
+ ldr x26, [fp, #0xF0] // [V03 loc0+0x18]
; gcrRegs +[x26]
cbz x26, G_M35275_IG13
mov x0, x25
@@ -194,7 +199,7 @@ G_M35275_IG09: ; bbWeight=2, gcrefRegs=100000 {x20}, byrefRegs=0000 {}, b
mov x1, x0
mov x0, x26
; gcrRegs +[x0]
- add x2, fp, #240 // [V03 loc0+0x10]
+ add x2, fp, #232 // [V03 loc0+0x10]
adrp x11, [HIGH RELOC #0xD1FFAB1E] // function address
add x11, x11, [LOW RELOC #0xD1FFAB1E]
ldr wzr, [x0]
@@ -213,11 +218,13 @@ G_M35275_IG10: ; bbWeight=2, gcrefRegs=100000 {x20}, byrefRegs=0000 {}, b
; gcr arg pop 0
;; size=20 bbWeight=2 PerfScore 11.00
G_M35275_IG11: ; bbWeight=2, nogc, extend
- ldp q16, q17, [fp, #0xF0]
- stp q16, q17, [fp, #0x20]
- ;; size=8 bbWeight=2 PerfScore 6.00
+ ldp x1, x2, [fp, #0xE8]
+ stp x1, x2, [fp, #0x18]
+ ldp x1, x2, [fp, #0xF8]
+ stp x1, x2, [fp, #0x28]
+ ;; size=16 bbWeight=2 PerfScore 16.00
G_M35275_IG12: ; bbWeight=2, extend
- add x1, fp, #32 // [V27 tmp22]
+ add x1, fp, #24 // [V27 tmp22]
adrp x11, [HIGH RELOC #0xD1FFAB1E] // function address
add x11, x11, [LOW RELOC #0xD1FFAB1E]
ldr xip0, [x11]
@@ -255,7 +262,7 @@ G_M35275_IG13: ; bbWeight=4, gcrefRegs=100000 {x20}, byrefRegs=0000 {}, b
;; size=84 bbWeight=4 PerfScore 78.00
G_M35275_IG14: ; bbWeight=8, gcrefRegs=100000 {x20}, byrefRegs=0000 {}, byref, isz
mov x1, x22
- add x0, fp, #224 // [V03 loc0]
+ add x0, fp, #216 // [V03 loc0]
mov x11, x23
ldr xip0, [x11]
blr xip0
@@ -271,7 +278,7 @@ G_M35275_IG15: ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
blr xip0
; gcr arg pop 0
mov x1, x0
- add x0, fp, #224 // [V03 loc0]
+ add x0, fp, #216 // [V03 loc0]
blr x1
; gcr arg pop 0
;; size=32 bbWeight=1 PerfScore 7.50
@@ -300,16 +307,14 @@ G_M35275_IG17: ; bbWeight=0.50, gcVars=0000000000002000 {V00}, gcrefRegs=
brk #0
;; size=40 bbWeight=0.50 PerfScore 6.75
G_M35275_IG18: ; bbWeight=0, gcVars=0000000000002000 {V00}, gcrefRegs=0000 {}, byrefRegs=0000 {}, gcvars, byref, funclet prolog, nogc
- stp fp, lr, [sp, #-0x60]!
- stp x19, x20, [sp, #0x20]
- stp x21, x22, [sp, #0x30]
- stp x23, x24, [sp, #0x40]
- stp x25, x26, [sp, #0x50]
- add x3, fp, #0xD1FFAB1E
- str x3, [sp, #0x18]
- ;; size=28 bbWeight=0 PerfScore 0.00
+ stp fp, lr, [sp, #-0x50]!
+ stp x19, x20, [sp, #0x10]
+ stp x21, x22, [sp, #0x20]
+ stp x23, x24, [sp, #0x30]
+ stp x25, x26, [sp, #0x40]
+ ;; size=20 bbWeight=0 PerfScore 0.00
G_M35275_IG19: ; bbWeight=0, gcVars=0000000000002000 {V00}, gcrefRegs=0000 {}, byrefRegs=0000 {}, gcvars, byref
- ldr x1, [fp, #0x18] // [V00 this]
+ ldr x1, [fp, #0x10] // [V00 this]
; gcrRegs +[x1]
ldr x21, [x1]
mov x0, x21
@@ -320,20 +325,20 @@ G_M35275_IG19: ; bbWeight=0, gcVars=0000000000002000 {V00}, gcrefRegs=000
; gcrRegs -[x1]
; gcr arg pop 0
mov x1, x0
- add x0, fp, #224 // [V03 loc0]
+ add x0, fp, #216 // [V03 loc0]
blr x1
; gcr arg pop 0
;; size=40 bbWeight=0 PerfScore 0.00
G_M35275_IG20: ; bbWeight=0, funclet epilog, nogc, extend
- ldp x25, x26, [sp, #0x50]
- ldp x23, x24, [sp, #0x40]
... |
/azp run runtime-coreclr gcstress0x3-gcstress0xc, runtime-coreclr gcstress-extra |
Azure Pipelines successfully started running 2 pipeline(s). |
} | ||
|
||
genFuncletInfo.fiFunction_CallerSP_to_FP_delta = genCallerSPtoFPdelta() - osrPad; | ||
|
||
regMaskTP rsMaskSaveRegs = regSet.rsMaskCalleeSaved; | ||
assert((rsMaskSaveRegs & RBM_LR) != 0); | ||
assert((rsMaskSaveRegs & RBM_FP) != 0); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there still a dependency on the funclet prolog to be the same as main method prolog with PSP removed?
Can this be changed to just rsMaskSaveRegs = RBM_LR | RBM_FP;
to avoid saving all registers that main method saves?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
My gut says that there's no dependency. There are few more places that are likely possible to optimize (#114630 (comment)). I'm torn between trying to address it here or separately. Even if the codegen is not perfect it was already somewhat tested by NativeAOT ABI.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is fine to do the additional cleanups in a follow up.
Windows x64 deprecated the frame pointers (BP:0 chains) and used the freed-up register to generate a bit better code. I believe PSPSym was a replacement of frame pointer for EH in this spirit. The codegen does not need to save and restore a frame pointer in the main method body with PSPSym. Majority of the cost to deal with PSPSym (really just a couple instructions here and there) can be shifted into the funclet that is expected to be a non-critical path. |
…sults in unwinding information not matching the code and error when unwinding inside the funclet epilog" This reverts commit 3bee83b.
/azp run runtime-coreclr gcstress0x3-gcstress0xc, runtime-coreclr gcstress-extra |
Azure Pipelines successfully started running 2 pipeline(s). |
More fine-grained ifdefs do not seem to be a good idea. See conversation at #113888 (comment)
My recommendation would be:
|
Summary of the latest GC stress failures before Azure drops the logs on next push:
|
The delta looks good to me. |
/azp run runtime-nativeaot-outerloop |
Azure Pipelines successfully started running 1 pipeline(s). |
The NativeAOT build failures are known issue, proposed fix is in #114764. Rest of the tests passed. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you!
Great work! |
I wonder if we should have changed the JIT/EE GUID: presumably a new JIT that doesn't emit PSPSym won't work with an old VM (which doesn't restore the right registers). Presumably an old JIT will work with a new VM, since the GC info where PSPSym is reported still exists. |
Yes, I think you are right. This should have change JIT/EE GUID. |
NativeAOT was already using an ABI without PSPSym and this aligns CoreCLR to match. The aim is to get some improvements in code quality and also get better testing for the code generation by unifying it across NativeAOT and CoreCLR and thus getting better GC stress coverage. Eventually it opens path to share more code between the two runtimes.
Removes any use of PSPSym from JIT, both for emitting and consumption.
The only usage of PSPSym in the VM was in
GetExactGenericsToken
. The encoding of generics instance context stack slot is changed to be SP/FP relative. The profiler code is adjusted accordingly.