Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Devirtualization issues in non-shared generics #37717

Closed
YohDeadfall opened this issue Jun 10, 2020 · 6 comments
Closed

Devirtualization issues in non-shared generics #37717

YohDeadfall opened this issue Jun 10, 2020 · 6 comments
Labels
area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI tenet-performance Performance related issue
Milestone

Comments

@YohDeadfall
Copy link
Contributor

YohDeadfall commented Jun 10, 2020

During benchmarking an optimization involving EqualityComparer<T>.Default I found an issue - if the comparer is used in a generic type devirtualization doesn't happen for some cases. In the example below only decimal comparison is inlined well in case of generics, but not for long. At the same time the JIT heavily optimized the code for long comparison in a non-generic code.

Generic

[SimpleJob(RuntimeMoniker.NetCoreApp50)]
[DisassemblyDiagnoser]
public abstract class Benchmark<T>
{
    private static IEqualityComparer<T> s_defaultComparer = EqualityComparer<T>.Default;

    [Benchmark]
    [ArgumentsSource(nameof(Values))]
    public bool IsDefaultInlined(T value) => EqualityComparer<T>.Default.Equals(default, value);

    public IEnumerable<object> Values()
    {
        foreach (var value in ValuesCore())
            yield return value;
    }

    protected abstract IEnumerable<T> ValuesCore();
}

public class Int64Benchmark : Benchmark<long>
{
    protected override IEnumerable<long> ValuesCore() => new[] { default, 2L };
}

public class DecimalBenchmark : Benchmark<decimal>
{
    protected override IEnumerable<decimal> ValuesCore() => new[] { default, 2.7M };
}

Int64Benchmark

; Json.Benchmark`1[[System.Int64, System.Private.CoreLib]].IsDefaultInlined(Int64)
       sub       rsp,28
       mov       rcx,14A100059B8
       mov       rcx,[rcx]
       mov       r8,rdx
       xor       edx,edx
       cmp       [rcx],ecx
       call      qword ptr [7FFC9C55F3D0]
       nop
       add       rsp,28
       ret
; Total bytes of code 36

; System.Collections.Generic.GenericEqualityComparer`1[[System.Int64, System.Private.CoreLib]].Equals(Int64, Int64)
       cmp       rdx,r8
       sete      al
       movzx     eax,al
       ret
; Total bytes of code 10

DecimalBenchmark

; Json.Benchmark`1[[System.Decimal, System.Private.CoreLib]].IsDefaultInlined(System.Decimal)
       sub       rsp,58
       vzeroupper
       vxorps    xmm0,xmm0,xmm0
       vmovdqu   xmmword ptr [rsp+48],xmm0
       vmovdqu   xmm0,xmmword ptr [rdx]
       vmovdqu   xmmword ptr [rsp+38],xmm0
       vmovupd   xmm0,[rsp+38]
       vmovupd   [rsp+28],xmm0
       lea       rcx,[rsp+48]
       lea       rdx,[rsp+28]
       call      System.Decimal+DecCalc.VarDecCmp(System.Decimal ByRef, System.Decimal ByRef)
       test      eax,eax
       sete      al
       movzx     eax,al
       add       rsp,58
       ret
; Total bytes of code 67

; System.Decimal+DecCalc.VarDecCmp(System.Decimal ByRef, System.Decimal ByRef)
       mov       eax,[rdx+8]
       or        eax,[rdx+0C]
       or        eax,[rdx+4]
       jne       short M01_L01
       mov       eax,[rcx+8]
       or        eax,[rcx+0C]
       or        eax,[rcx+4]
       jne       short M01_L00
       xor       eax,eax
       ret
M01_L00:
       mov       eax,[rcx]
       sar       eax,1F
       or        eax,1
       jmp       short M01_L03
M01_L01:
       mov       eax,[rcx+8]
       or        eax,[rcx+0C]
       or        eax,[rcx+4]
       jne       short M01_L02
       mov       ecx,[rdx]
       sar       ecx,1F
       or        ecx,1
       mov       eax,ecx
       neg       eax
       jmp       short M01_L03
M01_L02:
       mov       eax,[rcx]
       sar       eax,1F
       mov       r8d,[rdx]
       sar       r8d,1F
       sub       eax,r8d
       test      eax,eax
       je        short M01_L04
M01_L03:
       ret
M01_L04:
       jmp       near ptr System.Decimal+DecCalc.VarDecCmpSub(System.Decimal ByRef, System.Decimal ByRef)
; Total bytes of code 85

Non generic

[SimpleJob(RuntimeMoniker.NetCoreApp50)]
[DisassemblyDiagnoser]
public class Int64BenchmarkNonGeneric
{
    private static IEqualityComparer<long> s_defaultComparer = EqualityComparer<long>.Default;

    [Benchmark]
    [ArgumentsSource(nameof(Values))]
    public bool IsDefaultInlined(long value) => EqualityComparer<long>.Default.Equals(default, value);

    [Benchmark]
    [ArgumentsSource(nameof(Values))]
    public bool IsDefaultStored(long value) => s_defaultComparer.Equals(default, value);

    public IEnumerable<object> Values() => new object[] { 0L, 2L };
}

[SimpleJob(RuntimeMoniker.NetCoreApp50)]
[DisassemblyDiagnoser]
public class DecimalBenchmarkNonGeneric
{
    private static IEqualityComparer<decimal> s_defaultComparer = EqualityComparer<decimal>.Default;

    [Benchmark]
    [ArgumentsSource(nameof(Values))]
    public bool IsDefaultInlined(decimal value) => EqualityComparer<decimal>.Default.Equals(default, value);

    [Benchmark]
    [ArgumentsSource(nameof(Values))]
    public bool IsDefaultStored(decimal value) => s_defaultComparer.Equals(default, value);

    public IEnumerable<object> Values() => new object[] { 0M, 2M };
}

Int64BenchmarkNonGeneric

; Json.Int64BenchmarkNonGeneric.IsDefaultInlined(Int64)
       test      rdx,rdx
       sete      al
       movzx     eax,al
       ret
; Total bytes of code 10

DecimalBenchmarkNonGeneric

; Json.DecimalBenchmarkNonGeneric.IsDefaultInlined(System.Decimal)
       sub       rsp,58
       vzeroupper
       mov       rcx,1F4BD271050
       mov       rcx,[rcx]
       vmovdqu   xmm0,xmmword ptr [rcx+8]
       vmovdqu   xmmword ptr [rsp+48],xmm0
       vmovdqu   xmm0,xmmword ptr [rdx]
       vmovdqu   xmmword ptr [rsp+38],xmm0
       vmovupd   xmm0,[rsp+38]
       vmovupd   [rsp+28],xmm0
       lea       rcx,[rsp+48]
       lea       rdx,[rsp+28]
       call      System.Decimal+DecCalc.VarDecCmp(System.Decimal ByRef, System.Decimal ByRef)
       test      eax,eax
       sete      al
       movzx     eax,al
       add       rsp,58
       ret
; Total bytes of code 81

; System.Decimal+DecCalc.VarDecCmp(System.Decimal ByRef, System.Decimal ByRef)
       mov       eax,[rdx+8]
       or        eax,[rdx+0C]
       or        eax,[rdx+4]
       jne       short M01_L01
       mov       eax,[rcx+8]
       or        eax,[rcx+0C]
       or        eax,[rcx+4]
       jne       short M01_L00
       xor       eax,eax
       ret
M01_L00:
       mov       eax,[rcx]
       sar       eax,1F
       or        eax,1
       jmp       short M01_L03
M01_L01:
       mov       eax,[rcx+8]
       or        eax,[rcx+0C]
       or        eax,[rcx+4]
       jne       short M01_L02
       mov       ecx,[rdx]
       sar       ecx,1F
       or        ecx,1
       mov       eax,ecx
       neg       eax
       jmp       short M01_L03
M01_L02:
       mov       eax,[rcx]
       sar       eax,1F
       mov       r8d,[rdx]
       sar       r8d,1F
       sub       eax,r8d
       test      eax,eax
       je        short M01_L04
M01_L03:
       ret
M01_L04:
       jmp       near ptr System.Decimal+DecCalc.VarDecCmpSub(System.Decimal ByRef, System.Decimal ByRef)
; Total bytes of code 85

/cc @AndyAyersMS

category:cq
theme:devirtualization
skill-level:expert
cost:large

@YohDeadfall YohDeadfall added the tenet-performance Performance related issue label Jun 10, 2020
@Dotnet-GitSync-Bot Dotnet-GitSync-Bot added area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI untriaged New issue has not been triaged by the area owner labels Jun 10, 2020
@EgorBo
Copy link
Member

EgorBo commented Jun 11, 2020

duplicates #10050 ?

@YohDeadfall
Copy link
Contributor Author

No, the issue you referenced is about shared generics. This one is about non-shared generics/value types.

@BruceForstall BruceForstall removed the untriaged New issue has not been triaged by the area owner label Jun 12, 2020
@BruceForstall BruceForstall added this to the Future milestone Jun 12, 2020
@AndyAyersMS
Copy link
Member

There are two versions of GenericEqualityComparer<T> and I wonder if we're binding to the wrong one here. The one we want is where T is constrained to be IEquatable<T>. There is also an unconstrained version.

Looking at a variant of code the above, the jit fails to devirtualize initially because it doesn't see the comparer class as sealed/final (which it should be, if we have the right one):

impDevirtualizeCall: Trying to devirtualize virtual call:
    class for 'this' is System.Collections.Generic.EqualityComparer`1[Int64] (attrib 20000400)
    base method is System.Collections.Generic.EqualityComparer`1[Int64]::Equals
    devirt to System.Collections.Generic.EqualityComparer`1[Int64][System.Int64]::Equals -- inexact or not final
               [000012] --C-G-------              *  CALLV ind int    System.Collections.Generic.EqualityComparer`1[Int64][System.Int64].Equals
               [000009] ------------ this in rcx  +--*  LCL_VAR   ref    V04 tmp1         
               [000010] ------------ arg1         +--*  LCL_VAR   long   V02 loc0         
               [000011] ------------ arg2         \--*  LCL_VAR   long   V01 arg1         
    Class not final or exact, and method not final
NOT Marking call [000012] as guarded devirtualization candidate -- disabled by jit config

but after inlining (and simulating what we'd see for Tier1 code), the jit can can devirt as it can see the type of the readonly static and so does not need to depend on finalness anymore:

impDevirtualizeCall: Trying to devirtualize virtual call:
    class for 'this' is System.Collections.Generic.GenericEqualityComparer`1[Int64] [exact] (attrib 20000010)
    base method is System.Collections.Generic.EqualityComparer`1[Int64]::Equals
    devirt to System.Collections.Generic.GenericEqualityComparer`1[Int64][System.Int64]::Equals -- exact
               [000012] --C-G-------              *  CALLV ind int    System.Collections.Generic.EqualityComparer`1[Int64][System.Int64].Equals
               [000009] ------------ this in rcx  +--*  LCL_VAR   ref    V04 tmp1         
               [000010] ------------ arg1         +--*  LCL_VAR   long   V02 loc0         
               [000011] ------------ arg2         \--*  LCL_VAR   long   V01 arg1         
    exact; can devirtualize

we can't inline at this point, so the resulting code looks similar to what you'd see if we didn't devirtualize, but it should run a bit faster.

@AndyAyersMS
Copy link
Member

AndyAyersMS commented Jun 12, 2020

Actually it's simpler than that, the get_Default recognition requires that that call feed directly into the callvirt and here we've spilled to a temp because of the initobj:

IL_0000  28 04 00 00 0a    call         0xA000004      // get_Default
IL_0005  12 00             ldloca.s     0x0
IL_0007  fe 15 02 00 00 1b initobj      0x1B000002
IL_000d  06                ldloc.0     
IL_000e  03                ldarg.1     
IL_000f  6f 05 00 00 0a    callvirt     0xA000005   // Equals that we'd like to devirt
IL_0014  2a                ret  

post-import IR:

***** BB01
STMT00000 (IL 0x000...0x008)
               [000000] I-C-G-------              *  CALL      ref    System.Collections.Generic.EqualityComparer`1[Int64][System.Int64].get_Default (exactContextHnd=0x00007FFBFD9A9201)

***** BB01
STMT00002 (IL   ???...  ???)
               [000008] -AC---------              *  ASG       ref   
               [000007] D------N----              +--*  LCL_VAR   ref    V04 tmp1         
               [000001] --C---------              \--*  RET_EXPR  ref   (inl return from call [000000])

***** BB01
STMT00001 (IL   ???...  ???)
               [000006] IA----------              *  ASG       struct (init)
               [000005] -------N----              +--*  BLK       struct<8>
               [000003] ------------              |  \--*  ADDR      byref 
               [000002] -------N----              |     \--*  LCL_VAR   long   V02 loc0         
               [000004] ------------              \--*  CNS_INT   int    0

***** BB01
STMT00003 (IL   ???...0x014)
               [000013] --C-G-------              *  RETURN    int   
               [000012] --C-G-------              \--*  CALLV ind int    System.Collections.Generic.EqualityComparer`1[Int64][System.Int64].Equals
               [000009] ------------ this in rcx     +--*  LCL_VAR   ref    V04 tmp1         
               [000010] ------------ arg1            +--*  LCL_VAR   long   V02 loc0         
               [000011] ------------ arg2            \--*  LCL_VAR   long   V01 arg1 

In the non-virtual case there's just a constant there so no spilling happens:

IL_0000  28 04 00 00 0a    call         0xA000004
IL_0005  16                ldc.i4.0    
IL_0006  6a                conv.i8     
IL_0007  03                ldarg.1     
IL_0008  6f 05 00 00 0a    callvirt     0xA000005
IL_000d  2a                ret         

.... [ 3]   8 (0x008) callvirt 0A000005 (Implicit Tail call: prefixFlags |= PREFIX_TAILCALL_IMPLICIT)
In Compiler::impImportCall: opcode is callvirt, kind=4, callRetType is bool, structSize is 0
Special intrinsic: looking for exact type returned by System.Collections.Generic.EqualityComparer`1[Int64][System.Int64]:get_Default():System.Collections.Generic.EqualityComparer`1[Int64]
Named Intrinsic System.Collections.Generic.EqualityComparer`1.get_Default: Recognized
Special intrinsic for type System.Int64: return type is System.Collections.Generic.GenericEqualityComparer`1[Int64]

impDevirtualizeCall: Trying to devirtualize virtual call:
    class for 'this' is System.Collections.Generic.GenericEqualityComparer`1[Int64] [exact] (attrib 20000010)
    base method is System.Collections.Generic.EqualityComparer`1[Int64]::Equals
    devirt to System.Collections.Generic.GenericEqualityComparer`1[Int64][System.Int64]::Equals -- exact
               [000005] --C-G-------              *  CALLV ind int    System.Collections.Generic.EqualityComparer`1[Int64][System.Int64].Equals
               [000001] --C--------- this in rcx  +--*  RET_EXPR  ref   (inl return from call [000000])
               [000003] ------------ arg1         +--*  CAST      long <- int
               [000002] ------------              |  \--*  CNS_INT   int    0
               [000004] ------------ arg2         \--*  LCL_VAR   long   V01 arg1         
    exact; can devirtualize

So the root cause is that for a local of primitive typed T, ldloca; initobj does not get modelled in the same way as loading a constant zero.

@AndyAyersMS
Copy link
Member

If we aim to relax initobj spilling we should watch for cases like we saw in #764.

@BruceForstall BruceForstall added the JitUntriaged CLR JIT issues needing additional triage label Oct 28, 2020
@BruceForstall BruceForstall removed the JitUntriaged CLR JIT issues needing additional triage label Nov 7, 2020
@jakobbotsch
Copy link
Member

This looks fixed, codegen on main after tiering is now:

; Assembly listing for method C+Benchmark`1[long]:IsDefaultInlined(long):bool:this
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; Tier-1 compilation
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; 1 inlinees with PGO data; 2 single block inlinees; 0 inlinees without PGO data
; Final local variable assignments
;
;* V00 this         [V00    ] (  0,  0   )     ref  ->  zero-ref    this class-hnd single-def
;  V01 arg1         [V01,T00] (  3,  3   )    long  ->  rdx         single-def
;* V02 loc0         [V02    ] (  0,  0   )    long  ->  zero-ref    ld-addr-op
;# V03 OutArgs      [V03    ] (  1,  1   )  lclBlk ( 0) [rsp+00H]   "OutgoingArgSpace"
;* V04 tmp1         [V04,T02] (  0,  0   )     ref  ->  zero-ref    class-hnd exact single-def "impAppendStmt"
;  V05 tmp2         [V05,T01] (  2,  2   )    bool  ->  rax         single-def "Inline return value spill temp"
;* V06 tmp3         [V06    ] (  0,  0   )    long  ->  zero-ref    ld-addr-op "Inlining Arg"
;
; Lcl frame size = 0

G_M22461_IG01:
                                                ;; size=0 bbWeight=1    PerfScore 0.00
G_M22461_IG02:
       xor      eax, eax
       test     rdx, rdx
       sete     al
                                                ;; size=8 bbWeight=1    PerfScore 1.50
G_M22461_IG03:
       ret
                                                ;; size=1 bbWeight=1    PerfScore 1.00

; Total bytes of code 9, prolog size 0, PerfScore 3.40, instruction count 4, allocated bytes for code 9 (MethodHash=87b5a842) for method C+Benchmark`1[long]:IsDefaultInlined(long):bool:this
; ============================================================

@ghost ghost locked as resolved and limited conversation to collaborators Jan 6, 2023
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI tenet-performance Performance related issue
Projects
None yet
Development

No branches or pull requests

6 participants