-
Notifications
You must be signed in to change notification settings - Fork 4.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Consider to enable FEATURE_MULTICASTSTUB_AS_IL for Windows x86 #103958
Comments
@huoyaoyuan Could you please share the source for the microbenchmarks (I do not see it at https://github.com/search?q=repo%3Adotnet%2Fperformance%20ManyCast&type=code), the current code, and the baseline/pr disassembly for the one with worst regression? |
Benchmark code: public struct GCStruct
{
public object a, b, c, d, e, f, h, g;
}
public class Program
{
static void Main(string[] args)
{
BenchmarkSwitcher
.FromAssembly(typeof(Program).Assembly)
.Run(args);
}
private readonly object obj = new object();
private readonly Guid guid = Guid.NewGuid();
private readonly GCStruct gcStruct = new GCStruct
{
a = new object(),
g = new object()
};
private readonly Action<int> singleArg = (Action<int>)delegate { } + delegate { };
private readonly Action<int> manyCast = (Action<int>)delegate { } +
delegate { } + delegate { } + delegate { } + delegate { } + delegate { };
private readonly Func<int, double, Guid, GCStruct, object, double> manyArg_RetFPU = (Func<int, double, Guid, GCStruct, object, double>)
delegate { return 123.456; } + delegate { return 654.321; };
[Benchmark]
public void SingleArg() => singleArg(42);
[Benchmark]
public void ManyCast() => manyCast(42);
[Benchmark]
public double ManyArg_RetFPU() => manyArg_RetFPU(42, 123.0, guid, gcStruct, obj);
} It's not straightforward to disassemble il stub method with BenchmarkDotNet. I'll try to add them later. |
Dasm for PR (IL stub): ; Assembly listing for method System.Action`1[int]:IL_STUB_MulticastDelegate_Invoke(int):this (FullOpts)
; Emitting BLENDED_CODE for X86 with AVX - Windows
; FullOpts code
; optimized code
; optimized using Synthesized PGO
; ebp based frame
; partially interruptible
; with Synthesized PGO: fgCalledCount is 100
; No PGO data
G_M000_IG01: ;; offset=0x0000
push ebp
mov ebp, esp
push edi
push esi
push ebx
sub esp, 12
mov esi, ecx
mov edi, edx
G_M000_IG02: ;; offset=0x000D
mov ebx, dword ptr [esi+0x18]
mov dword ptr [ebp-0x10], ebx
xor eax, eax
jmp SHORT G_M000_IG04
G_M000_IG03: ;; offset=0x0017
mov edx, gword ptr [esi+0x14]
cmp eax, dword ptr [edx+0x04]
jae SHORT G_M000_IG06
mov dword ptr [ebp-0x14], eax
mov ecx, gword ptr [edx+4*eax+0x08]
mov gword ptr [ebp-0x18], ecx
mov edx, edi
mov ecx, gword ptr [ecx+0x04]
mov ebx, gword ptr [ebp-0x18]
call [ebx+0x0C]System.Action`1[int]:Invoke(int):this
mov ebx, dword ptr [ebp-0x14]
inc ebx
mov eax, ebx
mov ebx, dword ptr [ebp-0x10]
G_M000_IG04: ;; offset=0x003D
mov ecx, esi
mov dword ptr [ebp-0x14], eax
mov edx, eax
call System.StubHelpers.StubHelpers:MulticastDebuggerTraceHelper(System.Object,int)
mov eax, dword ptr [ebp-0x14]
cmp eax, ebx
jne SHORT G_M000_IG03
G_M000_IG05: ;; offset=0x0050
lea esp, [ebp-0x0C]
pop ebx
pop esi
pop edi
pop ebp
ret
G_M000_IG06: ;; offset=0x0058
call CORINFO_HELP_RNGCHKFAIL
int3
; Total bytes of code 94 |
Dasm for main: runtime/src/coreclr/vm/i386/stublinkerx86.cpp Line 3794 in eb29166
mov eax, [ecx + offsetof(_methodAuxPtr)]
; EmitMethodStubProlog(MulticastFrame::GetMethodFrameVPtr(), MulticastFrame::GetOffsetOfTransitionBlock())
push ebp ;; save callee-saved register
mov ebp,esp
push ebx ;; save callee-saved register
push esi ;; save callee-saved register
push edi ;; save callee-saved register
; Push & initialize ArgumentRegisters
push ecx
push edx
push eax ; Push m_datum
push edx ; push edx ;leave room for m_next (edx is an arbitrary choice)
push MulticastFrame__GetMethodFrameVPtr ; push Frame vptr
mov esi, esp
mov ebx, __Thread
mov edi,[ebx + Thread.GetFrame()] ;; get previous frame
mov [esi + Frame.m_next], edi
mov [ebx + Thread.GetFrame()], esi
; Patch label
push edi ;; Save EDI (want to use it as loop index)
xor edi,edi ;; Loop counter: EDI=0,1,2...
LOOP:
mov ecx, [esi + thisRegOffset] ;; get delegate
cmp edi,[ecx + Delegate___invocationCount]
je ENDLOOP
;; push [ESI + offset__stackArg], no stack arg for Action<int>
mov edx, [esi + offset__savedEDX]
mov eax, [ecx + Delegate___invocationCount] ;;fetch invocation list
mov eax, [eax + array__SzArrayData + edi*4] ;; index into invocation list
mov ecx, [eax + Delegate__target] ;;replace "this" pointer
call [eax + Delegate__methodPtr] ;; call current subscriber
; nop in DEBUG
inc edi
; FP ret handling
; Patch label
jmp LOOP
ENDLOOP:
pop edi
; CheckGSCookie in DEBUG
; EmitMethodStubEpilog(numStackBytes, MulticastFrame::GetOffsetOfTransitionBlock()), numStackBytes=0
mov [ebx + Thread.GetFrame()], edi ;; restore previous frame
add esp, sizeof(GSCookie) + transitionBlockOffset + TransitionBlock::GetOffsetOfCalleeSavedRegisters() ; deallocate Frame
pop edi
pop esi
pop ebx
pop ebp
ret 0 |
Some analysis:
|
The biggest impact is from |
CC @TIHan. |
These issues probably do not matter a whole lot, and they should be fixable by tweaking the IL if necessary.
We may be able to emit the call conditionally only when the debugger expects it to happen. Check a value of a global flag and skip the call if the flag is zero. @tommcdon @noahfalk Do we have a global flag that says whether the debugger is trying to trace towards a stub trace destination? If not, what would it take it to add it? |
Adding @hoyosjs . The easiest thing to check would be whether or not a debugger is actually attached to the process. If we want to get more granular than that, the debugger effectively has a state machine helps the stepper step into / over / out of stubs. I'm not aware of an easy check to determine that the debugger is actively stepping through a stub, though it seems reasonable that we could either build a heuristic that could answer this question or add some global state information that makes it easier to determine. So my guess is that we could add some global flag that can answer the question whether or not a debugger step-into, step-out, or a step-over is in progress. |
Sounds good. The fix should be to emit a conditional jump around runtime/src/coreclr/vm/comdelegate.cpp Lines 2191 to 2195 in 31e6c11
@huoyaoyuan Would you like to give this a try? |
Make sure I'm understanding: the check needs to happen at run time. The address of the global flag can be embedded into IL. Should it be a separated PR to verify impact on x86/arm64 first? |
Right
Either way is fine with me. |
After some fine tuning, I can consistently get better performance than main:
Branch prediction is also crucial here. Sample codegen for ; Assembly listing for method System.Action:IL_STUB_MulticastDelegate_Invoke():this (FullOpts)
; Emitting BLENDED_CODE for X86 with AVX - Windows
; FullOpts code
; optimized code
; optimized using Synthesized PGO
; ebp based frame
; partially interruptible
; with Synthesized PGO: fgCalledCount is 100
; No PGO data
G_M000_IG01: ;; offset=0x0000
push ebp
mov ebp, esp
push edi
push esi
mov esi, ecx
G_M000_IG02: ;; offset=0x0007
xor edi, edi
jmp SHORT G_M000_IG04
G_M000_IG03: ;; offset=0x000B
mov ecx, gword ptr [esi+0x14]
cmp edi, dword ptr [ecx+0x04]
jae SHORT G_M000_IG08
mov eax, gword ptr [ecx+4*edi+0x08]
mov ecx, gword ptr [eax+0x04]
call [eax+0x0C]System.Action:Invoke():this
inc edi
G_M000_IG04: ;; offset=0x001E
test dword ptr [0x79558E20], 512
jne SHORT G_M000_IG06
G_M000_IG05: ;; offset=0x002A
cmp edi, dword ptr [esi+0x18]
je SHORT G_M000_IG07
jmp SHORT G_M000_IG03
G_M000_IG06: ;; offset=0x0031
mov ecx, esi
mov edx, edi
call System.StubHelpers.StubHelpers:MulticastDebuggerTraceHelper(System.Object,int)
jmp SHORT G_M000_IG05
G_M000_IG07: ;; offset=0x003C
pop esi
pop edi
pop ebp
ret
G_M000_IG08: ;; offset=0x0040
call CORINFO_HELP_RNGCHKFAIL
int3
; Total bytes of code 70 |
Since the initial publish of coreclr, FEATURE_MULTICASTSTUB_AS_IL has been enabled on all platforms except Windows x86. Similar to #103533, the IL method stubs are easier to maintain.
Currently the IL version is functional for Windows x86, but performs worse:
Consider to improve the codegen in JIT for better performance.
The text was updated successfully, but these errors were encountered: