From 83d0b088f74ff87b039c02355179ee1dada6cf93 Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Sat, 19 Apr 2025 19:22:50 -0700 Subject: [PATCH 1/2] Fix typing of BlendVariableMask condition when optimizing TernaryLogic TernaryLogic nodes normalize small types to large types. When optimizing a TernaryLogic node to a BlendVariableMask, if we leave the normalized type, then the Blend uses the wrong instruction. To fix this, change the Blend node to use the mask simd base type. We only do this for mask base types that are small, which is unsatisfying: there exists codegen today for double->int casts, for example, that uses a 'double' type mask but an 'int' sized TernaryLogic node. This works because we end up only using lane '0', as the vector is converted to scalar. I didn't find any other case like this, so hopefully if the mask type is small we can safely use it. Fixes #114572 --- src/coreclr/jit/lowerxarch.cpp | 22 ++++++ .../JitBlue/Runtime_114572/Runtime_114572.cs | 72 +++++++++++++++++++ .../Runtime_114572/Runtime_114572.csproj | 8 +++ 3 files changed, 102 insertions(+) create mode 100644 src/tests/JIT/Regression/JitBlue/Runtime_114572/Runtime_114572.cs create mode 100644 src/tests/JIT/Regression/JitBlue/Runtime_114572/Runtime_114572.csproj diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index bac3a193332185..2f222ecd7b1f63 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -3875,6 +3875,28 @@ GenTree* Lowering::LowerHWIntrinsicTernaryLogic(GenTreeHWIntrinsic* node) } assert(varTypeIsMask(condition)); + + // The TernaryLogic node normalizes small SIMD base types on import. To optimize + // to BlendVariableMask, we need to "un-normalize". We no longer have the original + // base type, so we use the mask base type instead. + NamedIntrinsic intrinsicId = node->GetHWIntrinsicId(); + assert(HWIntrinsicInfo::NeedsNormalizeSmallTypeToInt(intrinsicId)); + + // The condition mask element size is expected to be the same size as the TernaryLogic + // node element size, unless the condition element size is short, in which case the + // TernaryLogic node element size would have been normalized. + // However, there is code, such as for double->int conversions, that generates a 'double' + // base type mask for a TernaryLogic node with base type 'int'. That works because it only + // cares about the '0' elem, as the result will be cast to scalar. + var_types conditionBaseType = condition->AsHWIntrinsic()->GetSimdBaseType(); + uint32_t conditionElemSize = genTypeSize(conditionBaseType); + uint32_t elemSize = genTypeSize(simdBaseType); + if (varTypeIsSmall(conditionBaseType) && (conditionElemSize < elemSize)) + { + CorInfoType simdBaseJitTypeCondition = condition->AsHWIntrinsic()->GetSimdBaseJitType(); + node->AsHWIntrinsic()->SetSimdBaseJitType(simdBaseJitTypeCondition); + } + node->ResetHWIntrinsicId(NI_EVEX_BlendVariableMask, comp, selectFalse, selectTrue, condition); BlockRange().Remove(op4); break; diff --git a/src/tests/JIT/Regression/JitBlue/Runtime_114572/Runtime_114572.cs b/src/tests/JIT/Regression/JitBlue/Runtime_114572/Runtime_114572.cs new file mode 100644 index 00000000000000..8b09bc0c167305 --- /dev/null +++ b/src/tests/JIT/Regression/JitBlue/Runtime_114572/Runtime_114572.cs @@ -0,0 +1,72 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// +// Generated by Fuzzlyn v2.5 on 2025-04-11 19:29:41 +// Run on X64 Windows +// Seed: 557319528607462789-vectort,vector128,vector256,x86aes,x86avx,x86avx2,x86avx512bw,x86avx512bwvl,x86avx512cd,x86avx512cdvl,x86avx512dq,x86avx512dqvl,x86avx512f,x86avx512fvl,x86avx512fx64,x86bmi1,x86bmi1x64,x86bmi2,x86bmi2x64,x86fma,x86lzcnt,x86lzcntx64,x86pclmulqdq,x86popcnt,x86popcntx64,x86sse,x86ssex64,x86sse2,x86sse2x64,x86sse3,x86sse41,x86sse41x64,x86sse42,x86sse42x64,x86ssse3,x86x86base +// Reduced from 41.1 KiB to 0.7 KiB in 00:01:23 +// Debug: Outputs <0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1> +// Release: Outputs <0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1> + +using System; +using System.Numerics; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +using Xunit; + +public class Runtime_114572 +{ + public static Vector256 s_2; + public static ushort s_4; + public static byte s_ub = 1; + public static ushort s_us = 1; + + [Fact] + public static void Problem() + { + if (Avx512F.VL.IsSupported) + { + var vr11 = Vector256.Create(0); + var vr12 = Vector256.Create(1); + var vr13 = (ushort)0; + var vr14 = Vector256.CreateScalar(vr13); + var vr15 = (ushort)1; + var vr16 = Vector256.CreateScalar(vr15); + var vr17 = Vector256.Create(s_4); + var vr18 = Avx2.Max(vr16, vr17); + s_2 = Avx512F.VL.TernaryLogic(vr11, vr12, Avx512BW.VL.CompareGreaterThanOrEqual(vr14, vr18), 216); + System.Console.WriteLine(s_2); + Assert.Equal(Vector256.Create((ushort)0, (ushort)1, (ushort)1, (ushort)1, (ushort)1, (ushort)1, (ushort)1, (ushort)1, (ushort)1, (ushort)1, (ushort)1, (ushort)1, (ushort)1, (ushort)1, (ushort)1, (ushort)1), s_2); + } + } + + [Fact] + public static void Problem2() + { + if (Avx512F.VL.IsSupported) + { + var v0 = Vector128.Zero; + var v1 = Vector128.One; + var vs = Vector128.CreateScalar(s_us); + + var vr = Avx512F.VL.TernaryLogic(v0, v1, Avx512BW.VL.CompareGreaterThanOrEqual(v0, vs), 0xE4); + Console.WriteLine(vr); + Assert.Equal(Vector128.Create((ushort)1, (ushort)0, (ushort)0, (ushort)0, (ushort)0, (ushort)0, (ushort)0, (ushort)0), vr); + } + } + + [Fact] + public static void Problem3() + { + if (Avx512F.VL.IsSupported) + { + var v0 = Vector128.Zero; + var v1 = Vector128.One; + var vs = Vector128.CreateScalar(s_ub); + + var vr = Avx512F.VL.TernaryLogic(v0, v1, Avx512BW.VL.CompareGreaterThanOrEqual(v0, vs), 0xE4); + Console.WriteLine(vr); + Assert.Equal(Vector128.Create((byte)1, (byte)0, (byte)0, (byte)0, (byte)0, (byte)0, (byte)0, (byte)0, (byte)0, (byte)0, (byte)0, (byte)0, (byte)0, (byte)0, (byte)0, (byte)0), vr); + } + } +} diff --git a/src/tests/JIT/Regression/JitBlue/Runtime_114572/Runtime_114572.csproj b/src/tests/JIT/Regression/JitBlue/Runtime_114572/Runtime_114572.csproj new file mode 100644 index 00000000000000..de6d5e08882e86 --- /dev/null +++ b/src/tests/JIT/Regression/JitBlue/Runtime_114572/Runtime_114572.csproj @@ -0,0 +1,8 @@ + + + True + + + + + From 88ccf95060747880dfefe727d40be09e4e7dc965 Mon Sep 17 00:00:00 2001 From: Bruce Forstall Date: Fri, 25 Apr 2025 17:03:22 -0700 Subject: [PATCH 2/2] Unconditionally set the base element type to the condition base element type But only if the condition is a HW intrinsic. --- src/coreclr/jit/lowerxarch.cpp | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 2f222ecd7b1f63..41f05cafb6d481 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -3882,21 +3882,13 @@ GenTree* Lowering::LowerHWIntrinsicTernaryLogic(GenTreeHWIntrinsic* node) NamedIntrinsic intrinsicId = node->GetHWIntrinsicId(); assert(HWIntrinsicInfo::NeedsNormalizeSmallTypeToInt(intrinsicId)); - // The condition mask element size is expected to be the same size as the TernaryLogic - // node element size, unless the condition element size is short, in which case the - // TernaryLogic node element size would have been normalized. - // However, there is code, such as for double->int conversions, that generates a 'double' - // base type mask for a TernaryLogic node with base type 'int'. That works because it only - // cares about the '0' elem, as the result will be cast to scalar. - var_types conditionBaseType = condition->AsHWIntrinsic()->GetSimdBaseType(); - uint32_t conditionElemSize = genTypeSize(conditionBaseType); - uint32_t elemSize = genTypeSize(simdBaseType); - if (varTypeIsSmall(conditionBaseType) && (conditionElemSize < elemSize)) + if (!condition->OperIsHWIntrinsic()) { - CorInfoType simdBaseJitTypeCondition = condition->AsHWIntrinsic()->GetSimdBaseJitType(); - node->AsHWIntrinsic()->SetSimdBaseJitType(simdBaseJitTypeCondition); + break; } + node->SetSimdBaseJitType(condition->AsHWIntrinsic()->GetSimdBaseJitType()); + node->ResetHWIntrinsicId(NI_EVEX_BlendVariableMask, comp, selectFalse, selectTrue, condition); BlockRange().Remove(op4); break;