Description
Description
The `Triplet.Plus` method has different performance characteristics on Windows and Ubuntu. I rewrote this method using an `out` parameter, which made it slightly slower on Windows and much faster on Ubuntu. Naturally, I would like to avoid the variant with the `out` parameter, because it results in less readable code.
The JIT also emits different assembly on the two platforms. Below is the code of the simplest repro I could make, which is a synthetic benchmark. I made a similar change in my ASP.NET Core 6.0 application and reached similar conclusions.
I consider this a performance bug. Please let me know if I can provide further assistance.
using System.Runtime.InteropServices;
namespace MyNamespace;
/// <summary>
/// Immutable value type holding three 32-bit integer components (12 bytes of payload),
/// declared with sequential layout and a packing of 1.
/// </summary>
/// <remarks>
/// <see cref="Plus"/> and <see cref="PlusOut"/> intentionally duplicate the same logic:
/// they differ only in how the result is handed back (by value vs. through an
/// <c>out</c> parameter), which is the difference this repro is measuring.
/// </remarks>
[StructLayout(LayoutKind.Sequential, Pack = 1)]
public readonly struct Triplet
{
/// <summary>First component. This is the only component inspected by <see cref="IsMax"/>.</summary>
public int A { get; }
/// <summary>Second component.</summary>
public int B { get; }
/// <summary>Third component.</summary>
public int C { get; }
/// <summary>Creates a triplet from its three components.</summary>
/// <param name="a">Value for <see cref="A"/>.</param>
/// <param name="b">Value for <see cref="B"/>.</param>
/// <param name="c">Value for <see cref="C"/>.</param>
public Triplet(int a, int b, int c)
{
A = a;
B = b;
C = c;
}
/// <summary>Sentinel value whose three components are all <see cref="int.MaxValue"/>.</summary>
public static readonly Triplet Max = new Triplet(int.MaxValue, int.MaxValue, int.MaxValue);
/// <summary>
/// <c>true</c> when <see cref="A"/> equals <see cref="int.MaxValue"/>.
/// Only <see cref="A"/> is checked; <see cref="B"/> and <see cref="C"/> are not examined.
/// </summary>
public bool IsMax => A == int.MaxValue;
/// <summary>
/// Adds two triplets component-wise and returns the result by value.
/// </summary>
/// <param name="left">First operand, passed by read-only reference.</param>
/// <param name="right">Second operand, passed by read-only reference.</param>
/// <returns>
/// <see cref="Max"/> if either operand reports <see cref="IsMax"/>; otherwise a new
/// triplet whose components are the sums of the corresponding operand components.
/// </returns>
/// <exception cref="System.OverflowException">
/// A component sum overflows <see cref="int"/> (the additions run in a <c>checked</c> context).
/// </exception>
public static Triplet Plus(in Triplet left, in Triplet right)
{
// The sentinel is absorbing: if either side is Max, skip the arithmetic entirely.
if (left.IsMax || right.IsMax)
{
return Max;
}
// Checked context: each addition throws on overflow instead of wrapping.
checked
{
int a = left.A + right.A;
int b = left.B + right.B;
int c = left.C + right.C;
return new Triplet(a, b, c);
}
}
/// <summary>
/// Adds two triplets component-wise and stores the result in <paramref name="result"/>.
/// Same logic as <see cref="Plus"/>, but the result is written through an <c>out</c>
/// parameter instead of being returned by value.
/// </summary>
/// <param name="left">First operand, passed by read-only reference.</param>
/// <param name="right">Second operand, passed by read-only reference.</param>
/// <param name="result">
/// Receives <see cref="Max"/> if either operand reports <see cref="IsMax"/>; otherwise a
/// new triplet whose components are the sums of the corresponding operand components.
/// </param>
/// <exception cref="System.OverflowException">
/// A component sum overflows <see cref="int"/> (the additions run in a <c>checked</c> context).
/// </exception>
public static void PlusOut(in Triplet left, in Triplet right, out Triplet result)
{
// The sentinel is absorbing: if either side is Max, skip the arithmetic entirely.
if (left.IsMax || right.IsMax)
{
result = Max;
return;
}
// Checked context: each addition throws on overflow instead of wrapping.
checked
{
int a = left.A + right.A;
int b = left.B + right.B;
int c = left.C + right.C;
result = new Triplet(a, b, c);
}
}
}
using BenchmarkDotNet.Attributes;
/// <summary>
/// Benchmarks comparing <c>Triplet.Plus</c> (result returned by value) against
/// <c>Triplet.PlusOut</c> (result written through an <c>out</c> parameter).
/// The disassembly diagnoser is configured to print source and to export
/// GitHub markdown and a diff.
/// </summary>
[DisassemblyDiagnoser(printSource: true, exportGithubMarkdown: true, exportDiff: true)]
public class TripletBenchmark
{
// Fixed operands: neither has A == int.MaxValue, so both benchmarks take the
// checked-addition path rather than the Max early-out.
private readonly Triplet tripletA = new(100, 100, 0);
private readonly Triplet tripletB = new(2340, 100, 0);
/// <summary>
/// Baseline: calls <c>Triplet.Plus</c> and compares the <c>A</c> component of the
/// returned triplet against 5000.
/// </summary>
/// <returns><c>true</c> when the summed <c>A</c> component is less than 5000.</returns>
[Benchmark(Baseline = true)]
public bool TripletPlus()
{
return Triplet.Plus(tripletA, tripletB).A < 5000;
}
/// <summary>
/// Calls <c>Triplet.PlusOut</c> into a local and compares the <c>A</c> component of
/// that local against 5000.
/// </summary>
/// <returns><c>true</c> when the summed <c>A</c> component is less than 5000.</returns>
[Benchmark]
public bool TripletPlusOut()
{
Triplet.PlusOut(tripletA, tripletB, out Triplet temp);
return temp.A < 5000;
}
}
Configuration
The Windows data was gathered by running on my laptop. The Ubuntu data was gathered on an Azure VM (Standard D4ds v5). For more precise configuration information, consult the 'Data' section.
Regression
I haven't tried any other versions of .NET.
Data
Windows
BenchmarkDotNet=v0.13.1, OS=Windows 10.0.19042.1766 (20H2/October2020Update)
Intel Core i7-7600U CPU 2.80GHz (Kaby Lake), 1 CPU, 4 logical and 2 physical cores
.NET SDK=6.0.301
[Host] : .NET 6.0.6 (6.0.622.26707), X64 RyuJIT
DefaultJob : .NET 6.0.6 (6.0.622.26707), X64 RyuJIT
Method | Mean | Error | StdDev | Median | Ratio | RatioSD | Code Size |
---|---|---|---|---|---|---|---|
TripletPlus | 3.110 ns | 0.0711 ns | 0.0631 ns | 3.112 ns | 1.00 | 0.00 | 190 B |
TripletPlusOut | 3.510 ns | 0.3743 ns | 1.0799 ns | 3.077 ns | 1.18 | 0.34 | 191 B |
Ubuntu
BenchmarkDotNet=v0.13.1, OS=ubuntu 20.04
Intel Xeon Platinum 8370C CPU 2.80GHz, 1 CPU, 4 logical and 2 physical cores
.NET SDK=6.0.301
[Host] : .NET 6.0.6 (6.0.622.26707), X64 RyuJIT
DefaultJob : .NET 6.0.6 (6.0.622.26707), X64 RyuJIT
Method | Mean | Error | StdDev | Ratio | Code Size |
---|---|---|---|---|---|
TripletPlus | 8.537 ns | 0.0032 ns | 0.0028 ns | 1.00 | 214 B |
TripletPlusOut | 2.315 ns | 0.0036 ns | 0.0033 ns | 0.27 | 206 B |
Analysis
Windows
.NET 6.0.6 (6.0.622.26707), X64 RyuJIT
; MyNamespace.TripletBenchmark.TripletPlus()
sub rsp,38
xor eax,eax
mov [rsp+28],rax
mov [rsp+30],rax
cmp [rcx],ecx
lea rdx,[rcx+8]
lea r8,[rcx+18]
lea rcx,[rsp+28]
call MyNamespace.Triplet.Plus(MyNamespace.Triplet ByRef, MyNamespace.Triplet ByRef)
cmp dword ptr [rsp+28],1388
setl al
movzx eax,al
add rsp,38
ret
; Total bytes of code 55
; MyNamespace.Triplet.Plus(MyNamespace.Triplet ByRef, MyNamespace.Triplet ByRef)
push rdi
push rsi
push rbx
sub rsp,20
mov rsi,rcx
mov edi,[rdx]
cmp edi,7FFFFFFF
je short M01_L00
mov ebx,[r8]
cmp ebx,7FFFFFFF
jne short M01_L01
M01_L00:
mov rcx,7FF7C7934FC8
mov edx,16
call CORINFO_HELP_GETSHARED_NONGCSTATIC_BASE
mov rax,1F033C37350
mov rax,[rax]
mov rdx,[rax+8]
mov [rsi],rdx
mov edx,[rax+10]
mov [rsi+8],edx
mov rax,rsi
add rsp,20
pop rbx
pop rsi
pop rdi
ret
M01_L01:
add edi,ebx
jo short M01_L02
mov eax,[rdx+4]
add eax,[r8+4]
jo short M01_L02
mov edx,[rdx+8]
add edx,[r8+8]
jo short M01_L02
mov [rsi],edi
mov [rsi+4],eax
mov [rsi+8],edx
mov rax,rsi
add rsp,20
pop rbx
pop rsi
pop rdi
ret
M01_L02:
call CORINFO_HELP_OVERFLOW
int 3
; Total bytes of code 135
.NET 6.0.6 (6.0.622.26707), X64 RyuJIT
; MyNamespace.TripletBenchmark.TripletPlusOut()
sub rsp,38
xor eax,eax
mov [rsp+28],rax
mov [rsp+30],rax
cmp [rcx],ecx
mov [rsp+40],rcx
add rcx,8
mov rdx,[rsp+40]
add rdx,18
lea r8,[rsp+28]
call MyNamespace.Triplet.PlusOut(MyNamespace.Triplet ByRef, MyNamespace.Triplet ByRef, MyNamespace.Triplet ByRef)
cmp dword ptr [rsp+28],1388
setl al
movzx eax,al
add rsp,38
ret
; Total bytes of code 65
; MyNamespace.Triplet.PlusOut(MyNamespace.Triplet ByRef, MyNamespace.Triplet ByRef, MyNamespace.Triplet ByRef)
push rdi
push rsi
push rbx
sub rsp,20
mov rsi,r8
mov edi,[rcx]
cmp edi,7FFFFFFF
je short M01_L00
mov ebx,[rdx]
cmp ebx,7FFFFFFF
jne short M01_L01
M01_L00:
mov rcx,7FF7C7914FC8
mov edx,16
call CORINFO_HELP_GETSHARED_NONGCSTATIC_BASE
mov rax,16818E97350
mov rax,[rax]
mov rdx,[rax+8]
mov [rsi],rdx
mov edx,[rax+10]
mov [rsi+8],edx
add rsp,20
pop rbx
pop rsi
pop rdi
ret
M01_L01:
add edi,ebx
jo short M01_L02
mov eax,[rcx+4]
add eax,[rdx+4]
jo short M01_L02
mov ecx,[rcx+8]
add ecx,[rdx+8]
jo short M01_L02
mov [rsi],edi
mov [rsi+4],eax
mov [rsi+8],ecx
add rsp,20
pop rbx
pop rsi
pop rdi
ret
M01_L02:
call CORINFO_HELP_OVERFLOW
int 3
; Total bytes of code 126
Ubuntu
.NET 6.0.6 (6.0.622.26707), X64 RyuJIT
; MyNamespace.TripletBenchmark.TripletPlus()
sub rsp,18
cmp [rdi],edi
mov [rsp],rdi
add rdi,8
mov rsi,[rsp]
add rsi,18
call MyNamespace.Triplet.Plus(MyNamespace.Triplet ByRef, MyNamespace.Triplet ByRef)
mov [rsp+8],rax
mov [rsp+10],edx
cmp dword ptr [rsp+8],1388
setl al
movzx eax,al
add rsp,18
ret
; Total bytes of code 55
; MyNamespace.Triplet.Plus(MyNamespace.Triplet ByRef, MyNamespace.Triplet ByRef)
push rbp
push r14
push rbx
sub rsp,20
lea rbp,[rsp+30]
mov ebx,[rdi]
cmp ebx,7FFFFFFF
je short M01_L00
mov r14d,[rsi]
cmp r14d,7FFFFFFF
jne short M01_L01
M01_L00:
mov rdi,7F945206D5E0
mov esi,16
call CORINFO_HELP_GETSHARED_NONGCSTATIC_BASE
mov rax,7F94340053D0
mov rax,[rax]
mov rdi,[rax+8]
mov [rbp-20],rdi
mov edi,[rax+10]
mov [rbp-18],edi
mov rax,[rbp-20]
mov edx,[rbp-18]
add rsp,20
pop rbx
pop r14
pop rbp
ret
M01_L01:
add ebx,r14d
jo short M01_L02
mov eax,[rdi+4]
add eax,[rsi+4]
jo short M01_L02
mov edi,[rdi+8]
add edi,[rsi+8]
jo short M01_L02
xor esi,esi
mov [rbp-30],rsi
mov [rbp-28],esi
mov [rbp-30],ebx
mov [rbp-2C],eax
mov [rbp-28],edi
mov rax,[rbp-30]
mov edx,[rbp-28]
add rsp,20
pop rbx
pop r14
pop rbp
ret
M01_L02:
call CORINFO_HELP_OVERFLOW
int 3
; Total bytes of code 159
.NET 6.0.6 (6.0.622.26707), X64 RyuJIT
; MyNamespace.TripletBenchmark.TripletPlusOut()
sub rsp,18
xor eax,eax
mov [rsp+8],rax
mov [rsp+10],rax
cmp [rdi],edi
mov [rsp],rdi
add rdi,8
mov rsi,[rsp]
add rsi,18
lea rdx,[rsp+8]
call MyNamespace.Triplet.PlusOut(MyNamespace.Triplet ByRef, MyNamespace.Triplet ByRef, MyNamespace.Triplet ByRef)
cmp dword ptr [rsp+8],1388
setl al
movzx eax,al
add rsp,18
ret
; Total bytes of code 63
; MyNamespace.Triplet.PlusOut(MyNamespace.Triplet ByRef, MyNamespace.Triplet ByRef, MyNamespace.Triplet ByRef)
push rbp
push r15
push r14
push rbx
push rax
lea rbp,[rsp+20]
mov rbx,rdx
mov r14d,[rdi]
cmp r14d,7FFFFFFF
je short M01_L00
mov r15d,[rsi]
cmp r15d,7FFFFFFF
jne short M01_L01
M01_L00:
mov rdi,7FBF90B9D5E0
mov esi,16
call CORINFO_HELP_GETSHARED_NONGCSTATIC_BASE
mov rax,7FBF700053D0
mov rax,[rax]
mov rdi,[rax+8]
mov [rbx],rdi
mov edi,[rax+10]
mov [rbx+8],edi
add rsp,8
pop rbx
pop r14
pop r15
pop rbp
ret
M01_L01:
add r14d,r15d
jo short M01_L02
mov eax,[rdi+4]
add eax,[rsi+4]
jo short M01_L02
mov edi,[rdi+8]
add edi,[rsi+8]
jo short M01_L02
mov [rbx],r14d
mov [rbx+4],eax
mov [rbx+8],edi
add rsp,8
pop rbx
pop r14
pop r15
pop rbp
ret
M01_L02:
call CORINFO_HELP_OVERFLOW
int 3
; Total bytes of code 143