-
Notifications
You must be signed in to change notification settings - Fork 4.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[JIT] Performance bug Windows vs Linux #71311
Comments
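Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch. Issue Details. Description: The Triplet.Plus method has different performance behaviour on Windows and Ubuntu. I rewrote this method using an out parameter, which made it a bit less performant on Windows and way more performant on Ubuntu. Obviously, I would like to avoid the variant with the out parameter, because it results in less readable code. The JIT also emitted different assembly. See below the code of the simplest repro I could make, which is a synthetic benchmark. I did a similar change on my ASP.NET Core 6.0 application, with similar conclusions. I consider this a performance bug. Please let me know if I can provide further assistance. using System.Runtime.InteropServices;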
namespace MyNamespace;
/// <summary>
/// Immutable value made of three <see cref="int"/> components (A, B, C),
/// declared with sequential layout and a packing of 1.
/// </summary>
[StructLayout(LayoutKind.Sequential, Pack = 1)]
public readonly struct Triplet
{
    /// <summary>First component; the only one inspected by <see cref="IsMax"/>.</summary>
    public int A { get; }

    /// <summary>Second component.</summary>
    public int B { get; }

    /// <summary>Third component.</summary>
    public int C { get; }

    /// <summary>Creates a triplet from its three components.</summary>
    /// <param name="a">Value stored in <see cref="A"/>.</param>
    /// <param name="b">Value stored in <see cref="B"/>.</param>
    /// <param name="c">Value stored in <see cref="C"/>.</param>
    public Triplet(int a, int b, int c)
    {
        (A, B, C) = (a, b, c);
    }

    /// <summary>Sentinel whose three components are all <see cref="int.MaxValue"/>.</summary>
    public static readonly Triplet Max = new Triplet(int.MaxValue, int.MaxValue, int.MaxValue);

    /// <summary>True when <see cref="A"/> equals <see cref="int.MaxValue"/>; B and C are not examined.</summary>
    public bool IsMax
    {
        get { return A == int.MaxValue; }
    }

    /// <summary>
    /// Adds two triplets component by component and returns the sum by value.
    /// </summary>
    /// <param name="left">First operand.</param>
    /// <param name="right">Second operand.</param>
    /// <returns>
    /// <see cref="Max"/> when either operand reports <see cref="IsMax"/>;
    /// otherwise a new triplet holding the three component sums.
    /// </returns>
    /// <exception cref="System.OverflowException">A component sum does not fit in an <see cref="int"/>.</exception>
    public static Triplet Plus(in Triplet left, in Triplet right)
    {
        bool saturated = left.IsMax || right.IsMax;
        if (!saturated)
        {
            // Sums are evaluated in A, B, C order, each with overflow checking.
            int sumA = checked(left.A + right.A);
            int sumB = checked(left.B + right.B);
            int sumC = checked(left.C + right.C);
            return new Triplet(sumA, sumB, sumC);
        }

        return Max;
    }

    /// <summary>
    /// Same computation as <see cref="Plus"/>, but the sum is written to
    /// <paramref name="result"/> instead of being returned.
    /// </summary>
    /// <param name="left">First operand.</param>
    /// <param name="right">Second operand.</param>
    /// <param name="result">
    /// Receives <see cref="Max"/> when either operand reports <see cref="IsMax"/>;
    /// otherwise the three component sums.
    /// </param>
    /// <exception cref="System.OverflowException">A component sum does not fit in an <see cref="int"/>.</exception>
    public static void PlusOut(in Triplet left, in Triplet right, out Triplet result)
    {
        bool saturated = left.IsMax || right.IsMax;
        if (!saturated)
        {
            // All three sums are computed into locals before result is written.
            int sumA = checked(left.A + right.A);
            int sumB = checked(left.B + right.B);
            int sumC = checked(left.C + right.C);
            result = new Triplet(sumA, sumB, sumC);
            return;
        }

        result = Max;
    }
}
using BenchmarkDotNet.Attributes;
[DisassemblyDiagnoser(printSource: true, exportGithubMarkdown: true, exportDiff: true)]
public class CrpTravelCostBenchmark
{
private readonly Triplet tripletA = new(100, 100, 0);
private readonly Triplet tripletB = new(2340, 100, 0);
/// <summary>
/// Baseline benchmark: adds the two triplet fields with <c>Triplet.Plus</c>
/// and reports whether the A component of the returned sum is below 5000.
/// </summary>
[Benchmark(Baseline = true)]
public bool TripletPlus() => Triplet.Plus(tripletA, tripletB).A < 5000;
/// <summary>
/// Benchmark of the out-parameter variant: adds the two triplet fields with
/// <c>Triplet.PlusOut</c> and reports whether the A component of the sum is below 5000.
/// </summary>
[Benchmark]
public bool TripletPlusOut()
{
    Triplet.PlusOut(tripletA, tripletB, out Triplet sum);
    bool belowLimit = sum.A < 5000;
    return belowLimit;
}
} Configuration: The Windows data I gathered by running on my laptop. The Ubuntu data was gathered on an Azure VM (Standard D4ds v5). For more precise configuration information, consult the 'Data' section. Regression: I haven't tried any other versions of .NET. Data: Windows
Ubuntu
AnalysisWindows.NET 6.0.6 (6.0.622.26707), X64 RyuJIT ; MapService.CrpBenchmark.CrpTravelCostBenchmark.TripletPlus()
sub rsp,38
xor eax,eax
mov [rsp+28],rax
mov [rsp+30],rax
cmp [rcx],ecx
lea rdx,[rcx+8]
lea r8,[rcx+18]
lea rcx,[rsp+28]
call MyNamespace.Triplet.Plus(MyNamespace.Triplet ByRef, MyNamespace.Triplet ByRef)
cmp dword ptr [rsp+28],1388
setl al
movzx eax,al
add rsp,38
ret
; Total bytes of code 55 ; MyNamespace.Triplet.Plus(MyNamespace.Triplet ByRef, MyNamespace.Triplet ByRef)
push rdi
push rsi
push rbx
sub rsp,20
mov rsi,rcx
mov edi,[rdx]
cmp edi,7FFFFFFF
je short M01_L00
mov ebx,[r8]
cmp ebx,7FFFFFFF
jne short M01_L01
M01_L00:
mov rcx,7FF7C7934FC8
mov edx,16
call CORINFO_HELP_GETSHARED_NONGCSTATIC_BASE
mov rax,1F033C37350
mov rax,[rax]
mov rdx,[rax+8]
mov [rsi],rdx
mov edx,[rax+10]
mov [rsi+8],edx
mov rax,rsi
add rsp,20
pop rbx
pop rsi
pop rdi
ret
M01_L01:
add edi,ebx
jo short M01_L02
mov eax,[rdx+4]
add eax,[r8+4]
jo short M01_L02
mov edx,[rdx+8]
add edx,[r8+8]
jo short M01_L02
mov [rsi],edi
mov [rsi+4],eax
mov [rsi+8],edx
mov rax,rsi
add rsp,20
pop rbx
pop rsi
pop rdi
ret
M01_L02:
call CORINFO_HELP_OVERFLOW
int 3
; Total bytes of code 135 .NET 6.0.6 (6.0.622.26707), X64 RyuJIT ; MapService.CrpBenchmark.CrpTravelCostBenchmark.TripletPlusOut()
sub rsp,38
xor eax,eax
mov [rsp+28],rax
mov [rsp+30],rax
cmp [rcx],ecx
mov [rsp+40],rcx
add rcx,8
mov rdx,[rsp+40]
add rdx,18
lea r8,[rsp+28]
call MyNamespace.Triplet.PlusOut(MyNamespace.Triplet ByRef, MyNamespace.Triplet ByRef, MyNamespace.Triplet ByRef)
cmp dword ptr [rsp+28],1388
setl al
movzx eax,al
add rsp,38
ret
; Total bytes of code 65 ; MyNamespace.Triplet.PlusOut(MyNamespace.Triplet ByRef, MyNamespace.Triplet ByRef, MyNamespace.Triplet ByRef)
push rdi
push rsi
push rbx
sub rsp,20
mov rsi,r8
mov edi,[rcx]
cmp edi,7FFFFFFF
je short M01_L00
mov ebx,[rdx]
cmp ebx,7FFFFFFF
jne short M01_L01
M01_L00:
mov rcx,7FF7C7914FC8
mov edx,16
call CORINFO_HELP_GETSHARED_NONGCSTATIC_BASE
mov rax,16818E97350
mov rax,[rax]
mov rdx,[rax+8]
mov [rsi],rdx
mov edx,[rax+10]
mov [rsi+8],edx
add rsp,20
pop rbx
pop rsi
pop rdi
ret
M01_L01:
add edi,ebx
jo short M01_L02
mov eax,[rcx+4]
add eax,[rdx+4]
jo short M01_L02
mov ecx,[rcx+8]
add ecx,[rdx+8]
jo short M01_L02
mov [rsi],edi
mov [rsi+4],eax
mov [rsi+8],ecx
add rsp,20
pop rbx
pop rsi
pop rdi
ret
M01_L02:
call CORINFO_HELP_OVERFLOW
int 3
; Total bytes of code 126 Ubuntu.NET 6.0.6 (6.0.622.26707), X64 RyuJIT ; MapService.CrpBenchmark.CrpTravelCostBenchmark.TripletPlus()
sub rsp,18
cmp [rdi],edi
mov [rsp],rdi
add rdi,8
mov rsi,[rsp]
add rsi,18
call MyNamespace.Triplet.Plus(MyNamespace.Triplet ByRef, MyNamespace.Triplet ByRef)
mov [rsp+8],rax
mov [rsp+10],edx
cmp dword ptr [rsp+8],1388
setl al
movzx eax,al
add rsp,18
ret
; Total bytes of code 55 ; MyNamespace.Triplet.Plus(MyNamespace.Triplet ByRef, MyNamespace.Triplet ByRef)
push rbp
push r14
push rbx
sub rsp,20
lea rbp,[rsp+30]
mov ebx,[rdi]
cmp ebx,7FFFFFFF
je short M01_L00
mov r14d,[rsi]
cmp r14d,7FFFFFFF
jne short M01_L01
M01_L00:
mov rdi,7F945206D5E0
mov esi,16
call CORINFO_HELP_GETSHARED_NONGCSTATIC_BASE
mov rax,7F94340053D0
mov rax,[rax]
mov rdi,[rax+8]
mov [rbp-20],rdi
mov edi,[rax+10]
mov [rbp-18],edi
mov rax,[rbp-20]
mov edx,[rbp-18]
add rsp,20
pop rbx
pop r14
pop rbp
ret
M01_L01:
add ebx,r14d
jo short M01_L02
mov eax,[rdi+4]
add eax,[rsi+4]
jo short M01_L02
mov edi,[rdi+8]
add edi,[rsi+8]
jo short M01_L02
xor esi,esi
mov [rbp-30],rsi
mov [rbp-28],esi
mov [rbp-30],ebx
mov [rbp-2C],eax
mov [rbp-28],edi
mov rax,[rbp-30]
mov edx,[rbp-28]
add rsp,20
pop rbx
pop r14
pop rbp
ret
M01_L02:
call CORINFO_HELP_OVERFLOW
int 3
; Total bytes of code 159 .NET 6.0.6 (6.0.622.26707), X64 RyuJIT ; MapService.CrpBenchmark.CrpTravelCostBenchmark.TripletPlusOut()
sub rsp,18
xor eax,eax
mov [rsp+8],rax
mov [rsp+10],rax
cmp [rdi],edi
mov [rsp],rdi
add rdi,8
mov rsi,[rsp]
add rsi,18
lea rdx,[rsp+8]
call MyNamespace.Triplet.PlusOut(MyNamespace.Triplet ByRef, MyNamespace.Triplet ByRef, MyNamespace.Triplet ByRef)
cmp dword ptr [rsp+8],1388
setl al
movzx eax,al
add rsp,18
ret
; Total bytes of code 63 ; MyNamespace.Triplet.PlusOut(MyNamespace.Triplet ByRef, MyNamespace.Triplet ByRef, MyNamespace.Triplet ByRef)
push rbp
push r15
push r14
push rbx
push rax
lea rbp,[rsp+20]
mov rbx,rdx
mov r14d,[rdi]
cmp r14d,7FFFFFFF
je short M01_L00
mov r15d,[rsi]
cmp r15d,7FFFFFFF
jne short M01_L01
M01_L00:
mov rdi,7FBF90B9D5E0
mov esi,16
call CORINFO_HELP_GETSHARED_NONGCSTATIC_BASE
mov rax,7FBF700053D0
mov rax,[rax]
mov rdi,[rax+8]
mov [rbx],rdi
mov edi,[rax+10]
mov [rbx+8],edi
add rsp,8
pop rbx
pop r14
pop r15
pop rbp
ret
M01_L01:
add r14d,r15d
jo short M01_L02
mov eax,[rdi+4]
add eax,[rsi+4]
jo short M01_L02
mov edi,[rdi+8]
add edi,[rsi+8]
jo short M01_L02
mov [rbx],r14d
mov [rbx+4],eax
mov [rbx+8],edi
add rsp,8
pop rbx
pop r14
pop r15
pop rbp
ret
M01_L02:
call CORINFO_HELP_OVERFLOW
int 3
; Total bytes of code 143
|
The reason is #8887 (so I assume we can close this as a dup). Minimal repro: public struct Triplet
{
public int A, B, C;
/// <summary>Returns a triplet initialized with A = 1, B = 2 and C = 3.</summary>
public static Triplet Plus()
{
    return new Triplet { A = 1, B = 2, C = 3 };
}
} Windows-x64: mov dword ptr [rcx], 1
mov dword ptr [rcx+4], 2
mov dword ptr [rcx+8], 3
mov rax, rcx
ret Linux-x64: sub rsp, 24
xor eax, eax
mov qword ptr [rsp+08H], rax
mov qword ptr [rsp+0CH], rax
mov dword ptr [rsp+08H], 1
mov dword ptr [rsp+0CH], 2
mov dword ptr [rsp+10H], 3
mov rax, qword ptr [rsp+08H]
mov edx, dword ptr [rsp+10H]
add rsp, 24
ret So for the SysV x64 ABI we introduce a struct copy on the stack before returning it via two registers, while on Windows we write directly into the return buffer. Temporary workaround: mark your method as AggressiveInlining so it won't be a bottleneck |
Thank you for your quick response. I will try out the |
Cc @dotnet/jit-contrib. |
I tried out the |
Description
The
Triplet.Plus
method has different performance behaviour on Windows and Ubuntu. I rewrote this method using an out parameter, which made it a bit less performant on Windows and way more performant on Ubuntu. Obviously, I would like to avoid the variant with the out parameter, because it results in less readable code. The JIT also emitted different assembly. See below the code of the simplest repro I could make, which is a synthetic benchmark. I did a similar change on my ASP.NET Core 6.0 application, with similar conclusions.
I consider this a performance bug. Please let me know if I can provide further assistance.
Configuration
The Windows data I gathered by running on my laptop. The Ubuntu data was gathered on an Azure VM (Standard D4ds v5). For more precise configuration information, consult the 'Data' section.
Regression
I haven't tried any other versions of .NET.
Data
Windows
Ubuntu
Analysis
Windows
.NET 6.0.6 (6.0.622.26707), X64 RyuJIT
.NET 6.0.6 (6.0.622.26707), X64 RyuJIT
Ubuntu
.NET 6.0.6 (6.0.622.26707), X64 RyuJIT
.NET 6.0.6 (6.0.622.26707), X64 RyuJIT
The text was updated successfully, but these errors were encountered: