Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

JIT: Bad codegen on win-x86 #90219

Closed
jakobbotsch opened this issue Aug 9, 2023 · 2 comments · Fixed by #90318
Closed

JIT: Bad codegen on win-x86 #90219

jakobbotsch opened this issue Aug 9, 2023 · 2 comments · Fixed by #90318
Assignees
Labels
area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI
Milestone

Comments

@jakobbotsch
Copy link
Member

// Generated by Fuzzlyn v1.5 on 2023-08-05 16:08:41
// Run on X86 Windows
// Seed: 7610693270284065118
// Reduced from 12.0 KiB to 2.2 KiB in 00:01:06
// Debug: Outputs 1
// Release: Outputs 2
public interface I0
{
}

public class C0 : I0
{
    public ulong F0;
    public uint F4;
    public C0(ulong f0, uint f4)
    {
        F0 = f0;
        F4 = f4;
    }
}

public class Program
{
    public static IRuntime s_rt;
    public static I0 s_1;
    public static uint s_2;
    public static byte[] s_4 = new byte[]{0};
    public static void Main()
    {
        CollectibleALC alc = new CollectibleALC();
        System.Reflection.Assembly asm = alc.LoadFromAssemblyPath(System.Reflection.Assembly.GetExecutingAssembly().Location);
        System.Reflection.MethodInfo mi = asm.GetType(typeof(Program).FullName).GetMethod(nameof(MainInner));
        System.Type runtimeTy = asm.GetType(typeof(Runtime).FullName);
        mi.Invoke(null, new object[]{System.Activator.CreateInstance(runtimeTy)});
    }

    public static void MainInner(IRuntime rt)
    {
        s_rt = rt;
        var vr8 = new C0(0, 0);
        var vr12 = s_4[0];
        vr8.F0 *= M2(vr12);
        long[] vr9 = new long[]{0};
        bool vr10 = (int)M2(0) <= vr9[0];
    }

    public static ulong M2(byte arg0)
    {
        for (int var0 = 0; var0 < 2; var0++)
        {
            var vr1 = new C0(0, 0);
            var vr3 = new C0(0, 0);
            uint vr14 = s_2;
            s_rt.WriteLine("c_0", vr14);
            uint vr15 = vr3.F4;
            s_2 = Program.M3(vr1, vr15);
            s_1 = s_1;
            s_1 = new C0(0, 0);
            s_rt.WriteLine("c_6", var0);
        }

        var vr5 = new C0(0, 1);
        uint vr18 = vr5.F4;
        arg0 = (byte)vr18;
        var vr7 = new C0(0, 1838341904U);
        if ((arg0 >= (byte)M3(vr7, 0)))
        {
            arg0 <<= 1;
        }

        s_rt.WriteLine("c_7", arg0);
        return 0;
    }

    public static uint M3(C0 argThis, uint arg0)
    {
        s_rt.WriteLine("c_0", arg0);
        return argThis.F4;
    }
}

public interface IRuntime
{
    void WriteLine<T>(string site, T value);
}

public class Runtime : IRuntime
{
    public void WriteLine<T>(string site, T value) => System.Console.WriteLine(value);
}

public class CollectibleALC : System.Runtime.Loader.AssemblyLoadContext
{
    public CollectibleALC(): base(true)
    {
    }
}
@dotnet-issue-labeler dotnet-issue-labeler bot added the needs-area-label An area label is needed to ensure this gets routed to the appropriate area owners label Aug 9, 2023
@ghost ghost added the untriaged New issue has not been triaged by the area owner label Aug 9, 2023
@jakobbotsch jakobbotsch added area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI and removed untriaged New issue has not been triaged by the area owner needs-area-label An area label is needed to ensure this gets routed to the appropriate area owners labels Aug 9, 2023
@ghost
Copy link

ghost commented Aug 9, 2023

Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch
See info in area-owners.md if you want to be subscribed.

Issue Details
// Generated by Fuzzlyn v1.5 on 2023-08-05 16:08:41
// Run on X86 Windows
// Seed: 7610693270284065118
// Reduced from 12.0 KiB to 2.2 KiB in 00:01:06
// Debug: Outputs 1
// Release: Outputs 2
public interface I0
{
}

public class C0 : I0
{
    public ulong F0;
    public uint F4;
    public C0(ulong f0, uint f4)
    {
        F0 = f0;
        F4 = f4;
    }
}

public class Program
{
    public static IRuntime s_rt;
    public static I0 s_1;
    public static uint s_2;
    public static byte[] s_4 = new byte[]{0};
    public static void Main()
    {
        CollectibleALC alc = new CollectibleALC();
        System.Reflection.Assembly asm = alc.LoadFromAssemblyPath(System.Reflection.Assembly.GetExecutingAssembly().Location);
        System.Reflection.MethodInfo mi = asm.GetType(typeof(Program).FullName).GetMethod(nameof(MainInner));
        System.Type runtimeTy = asm.GetType(typeof(Runtime).FullName);
        mi.Invoke(null, new object[]{System.Activator.CreateInstance(runtimeTy)});
    }

    public static void MainInner(IRuntime rt)
    {
        s_rt = rt;
        var vr8 = new C0(0, 0);
        var vr12 = s_4[0];
        vr8.F0 *= M2(vr12);
        long[] vr9 = new long[]{0};
        bool vr10 = (int)M2(0) <= vr9[0];
    }

    public static ulong M2(byte arg0)
    {
        for (int var0 = 0; var0 < 2; var0++)
        {
            var vr1 = new C0(0, 0);
            var vr3 = new C0(0, 0);
            uint vr14 = s_2;
            s_rt.WriteLine("c_0", vr14);
            uint vr15 = vr3.F4;
            s_2 = Program.M3(vr1, vr15);
            s_1 = s_1;
            s_1 = new C0(0, 0);
            s_rt.WriteLine("c_6", var0);
        }

        var vr5 = new C0(0, 1);
        uint vr18 = vr5.F4;
        arg0 = (byte)vr18;
        var vr7 = new C0(0, 1838341904U);
        if ((arg0 >= (byte)M3(vr7, 0)))
        {
            arg0 <<= 1;
        }

        s_rt.WriteLine("c_7", arg0);
        return 0;
    }

    public static uint M3(C0 argThis, uint arg0)
    {
        s_rt.WriteLine("c_0", arg0);
        return argThis.F4;
    }
}

public interface IRuntime
{
    void WriteLine<T>(string site, T value);
}

public class Runtime : IRuntime
{
    public void WriteLine<T>(string site, T value) => System.Console.WriteLine(value);
}

public class CollectibleALC : System.Runtime.Loader.AssemblyLoadContext
{
    public CollectibleALC(): base(true)
    {
    }
}
Author: jakobbotsch
Assignees: -
Labels:

area-CodeGen-coreclr, untriaged, needs-area-label

Milestone: -

@jakobbotsch jakobbotsch added this to the 8.0.0 milestone Aug 9, 2023
@jakobbotsch jakobbotsch self-assigned this Aug 10, 2023
@jakobbotsch
Copy link
Member Author

This is another case of morph using a subrange assertion to try to optimize away normalization, where the backend then does not satisfy the necessary invariants to make this work correctly. Some annotated codegen:

G_M21776_IG12:  ;; offset=0x010B
       mov      dword ptr [ecx+0x04], edx
       mov      dword ptr [eax+0x0C], 1
       movzx    ecx, byte  ptr [eax+0x0C]
       mov      byte  ptr [ebp-0x10], cl        ; arg0 = 0; 1 byte write
       mov      ecx, 0xAC1D838      ; C0
       call     CORINFO_HELP_NEWSFAST
       lea      ecx, bword ptr [eax+0x04]
       xor      edx, edx
       mov      dword ptr [ecx], edx
                                                ;; size=34 bbWeight=1 PerfScore 8.00
G_M21776_IG13:  ;; offset=0x012D
       mov      dword ptr [ecx+0x04], edx
       mov      dword ptr [eax+0x0C], 0x6D92DF10
       mov      ecx, eax
                                                ;; size=12 bbWeight=1 PerfScore 2.25
G_M21776_IG14:  ;; offset=0x0139
       call     [Program:M3(C0,uint):uint]
       movzx    ecx, al
       mov      eax, dword ptr [ebp-0x10]  ; load arg0; 4 byte read
       movzx    edx, al                    ; normalize it properly, ok
       add      edx, edx
       movzx    edx, dl
       cmp      ecx, eax                   ; use of unnormalized arg0; not ok
       cmovle   eax, edx
       mov      byte  ptr [ebp-0x10], al
       mov      esi, gword ptr [ebx]
       push     0xAC1DBB8
       mov      ecx, esi
       mov      edx, 0xAC1D338      ; IRuntime
       call     CORINFO_HELP_VIRTUAL_FUNC_PTR
       movzx    edx, byte  ptr [ebp-0x10]
       push     edx
       mov      edx, gword ptr [0x070F3020]      ; string handle
       mov      ecx, esi
       call     eax
       xor      eax, eax
       xor      edx, edx

jakobbotsch added a commit to jakobbotsch/runtime that referenced this issue Aug 10, 2023
The existing logic would sometimes unspill using the type of the local
that is being unspilled. This type is often wider than the exact small
type in the LclVarDsc, since NOL locals are normally expanded as
CAST(<small type>, LCL_VAR<int>).

This causes problems since morph will in some cases avoid inserting
normalization for NOL locals when it has a subrange assertion available.
This optimization relies on the backend always ensuring that the local
will be normalized as part of unspilling and args homing.

Size-wise regressions are expected on xarch since the encoding of the
normalizing load is larger. However, as we have seen before, using wide
loads can cause significant store-forwarding stalls which can have large
negative impact on performance, so overall there is an expected perf
benefit of using the small loads (in addition to the correctness fix).

Fix dotnet#90219
@ghost ghost added the in-pr There is an active PR which will close this issue when it is merged label Aug 10, 2023
jakobbotsch added a commit that referenced this issue Aug 14, 2023
The existing logic would sometimes unspill using the type of the local
that is being unspilled. This type is often wider than the exact small
type in the LclVarDsc, since NOL locals are normally expanded as
CAST(<small type>, LCL_VAR<int>).

This causes problems since morph will in some cases avoid inserting
normalization for NOL locals when it has a subrange assertion available.
This optimization relies on the backend always ensuring that the local
will be normalized as part of unspilling and args homing.

Size-wise regressions are expected on xarch since the encoding of the
normalizing load is larger. However, as we have seen before, using wide
loads can cause significant store-forwarding stalls which can have large
negative impact on performance, so overall there is an expected perf
benefit of using the small loads (in addition to the correctness fix).

Fix #90219
@ghost ghost removed the in-pr There is an active PR which will close this issue when it is merged label Aug 14, 2023
@ghost ghost locked as resolved and limited conversation to collaborators Sep 13, 2023
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI
Projects
None yet
Development

Successfully merging a pull request may close this issue.

1 participant