From 1c18b3290b825e66e973e147eda8c7cca3e539c6 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Thu, 6 Dec 2018 17:15:03 -0800 Subject: [PATCH] Moving CreateScalarUnsafe, ToScalar, Vector128.ToVector256Unsafe, and Vector256.GetLower to be intrinsics (#21351) * Moving CreateScalarUnsafe, ToScalar, Vector128.ToVector256Unsafe, and Vector256.GetLower to be intrinsics * Adding containment support to the helper intrinsics --- .../System/Runtime/Intrinsics/Vector128.cs | 11 ++ .../System/Runtime/Intrinsics/Vector128_1.cs | 3 + .../System/Runtime/Intrinsics/Vector256.cs | 11 ++ .../System/Runtime/Intrinsics/Vector256_1.cs | 2 + src/jit/compiler.h | 5 +- src/jit/hwintrinsiccodegenxarch.cpp | 106 +++++++++-- src/jit/hwintrinsiclistxarch.h | 11 +- src/jit/importer.cpp | 171 +++++++++++++++++- src/jit/lsraxarch.cpp | 54 ++++++ 9 files changed, 349 insertions(+), 25 deletions(-) diff --git a/src/System.Private.CoreLib/shared/System/Runtime/Intrinsics/Vector128.cs b/src/System.Private.CoreLib/shared/System/Runtime/Intrinsics/Vector128.cs index 41f069f753ac..445ccb93109b 100644 --- a/src/System.Private.CoreLib/shared/System/Runtime/Intrinsics/Vector128.cs +++ b/src/System.Private.CoreLib/shared/System/Runtime/Intrinsics/Vector128.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using System.Runtime.CompilerServices; using Internal.Runtime.CompilerServices; namespace System.Runtime.Intrinsics @@ -707,6 +708,7 @@ public static unsafe Vector128 CreateScalar(ulong value) /// Creates a new instance with the first element initialized to the specified value and the remaining elements left uninitialized. /// The value that element 0 will be initialized to. /// A new instance with the first element initialized to and the remaining elements left uninitialized. + [Intrinsic] public static unsafe Vector128 CreateScalarUnsafe(byte value) { // This relies on us stripping the "init" flag from the ".locals" @@ -720,6 +722,7 @@ public static unsafe Vector128 CreateScalarUnsafe(byte value) /// Creates a new instance with the first element initialized to the specified value and the remaining elements left uninitialized. /// The value that element 0 will be initialized to. /// A new instance with the first element initialized to and the remaining elements left uninitialized. + [Intrinsic] public static unsafe Vector128 CreateScalarUnsafe(double value) { // This relies on us stripping the "init" flag from the ".locals" @@ -733,6 +736,7 @@ public static unsafe Vector128 CreateScalarUnsafe(double value) /// Creates a new instance with the first element initialized to the specified value and the remaining elements left uninitialized. /// The value that element 0 will be initialized to. /// A new instance with the first element initialized to and the remaining elements left uninitialized. + [Intrinsic] public static unsafe Vector128 CreateScalarUnsafe(short value) { // This relies on us stripping the "init" flag from the ".locals" @@ -746,6 +750,7 @@ public static unsafe Vector128 CreateScalarUnsafe(short value) /// Creates a new instance with the first element initialized to the specified value and the remaining elements left uninitialized. /// The value that element 0 will be initialized to. /// A new instance with the first element initialized to and the remaining elements left uninitialized. + [Intrinsic] public static unsafe Vector128 CreateScalarUnsafe(int value) { // This relies on us stripping the "init" flag from the ".locals" @@ -759,6 +764,7 @@ public static unsafe Vector128 CreateScalarUnsafe(int value) /// Creates a new instance with the first element initialized to the specified value and the remaining elements left uninitialized. /// The value that element 0 will be initialized to. /// A new instance with the first element initialized to and the remaining elements left uninitialized. + [Intrinsic] public static unsafe Vector128 CreateScalarUnsafe(long value) { // This relies on us stripping the "init" flag from the ".locals" @@ -772,6 +778,7 @@ public static unsafe Vector128 CreateScalarUnsafe(long value) /// Creates a new instance with the first element initialized to the specified value and the remaining elements left uninitialized. /// The value that element 0 will be initialized to. /// A new instance with the first element initialized to and the remaining elements left uninitialized. + [Intrinsic] [CLSCompliant(false)] public static unsafe Vector128 CreateScalarUnsafe(sbyte value) { @@ -786,6 +793,7 @@ public static unsafe Vector128 CreateScalarUnsafe(sbyte value) /// Creates a new instance with the first element initialized to the specified value and the remaining elements left uninitialized. /// The value that element 0 will be initialized to. /// A new instance with the first element initialized to and the remaining elements left uninitialized. + [Intrinsic] public static unsafe Vector128 CreateScalarUnsafe(float value) { // This relies on us stripping the "init" flag from the ".locals" @@ -799,6 +807,7 @@ public static unsafe Vector128 CreateScalarUnsafe(float value) /// Creates a new instance with the first element initialized to the specified value and the remaining elements left uninitialized. /// The value that element 0 will be initialized to. /// A new instance with the first element initialized to and the remaining elements left uninitialized. + [Intrinsic] [CLSCompliant(false)] public static unsafe Vector128 CreateScalarUnsafe(ushort value) { @@ -813,6 +822,7 @@ public static unsafe Vector128 CreateScalarUnsafe(ushort value) /// Creates a new instance with the first element initialized to the specified value and the remaining elements left uninitialized. /// The value that element 0 will be initialized to. /// A new instance with the first element initialized to and the remaining elements left uninitialized. + [Intrinsic] [CLSCompliant(false)] public static unsafe Vector128 CreateScalarUnsafe(uint value) { @@ -827,6 +837,7 @@ public static unsafe Vector128 CreateScalarUnsafe(uint value) /// Creates a new instance with the first element initialized to the specified value and the remaining elements left uninitialized. /// The value that element 0 will be initialized to. /// A new instance with the first element initialized to and the remaining elements left uninitialized. + [Intrinsic] [CLSCompliant(false)] public static unsafe Vector128 CreateScalarUnsafe(ulong value) { diff --git a/src/System.Private.CoreLib/shared/System/Runtime/Intrinsics/Vector128_1.cs b/src/System.Private.CoreLib/shared/System/Runtime/Intrinsics/Vector128_1.cs index 95f217d64e27..3c15de6aaa0a 100644 --- a/src/System.Private.CoreLib/shared/System/Runtime/Intrinsics/Vector128_1.cs +++ b/src/System.Private.CoreLib/shared/System/Runtime/Intrinsics/Vector128_1.cs @@ -303,6 +303,7 @@ public Vector128 WithUpper(Vector64 value) /// Converts the current instance to a scalar containing the value of the first element. /// A scalar containing the value of the first element. /// The type of the current instance () is not supported. + [Intrinsic] public T ToScalar() { ThrowIfUnsupportedType(); @@ -356,6 +357,7 @@ public string ToString(string format, IFormatProvider formatProvider) /// Converts the current instance to a new with the lower 128-bits set to the value of the current instance and the upper 128-bits initialized to zero. /// A new with the lower 128-bits set to the value of the current instance and the upper 128-bits initialized to zero. /// The type of the current instance () is not supported. + [Intrinsic] public Vector256 ToVector256() { ThrowIfUnsupportedType(); @@ -369,6 +371,7 @@ public Vector256 ToVector256() /// Converts the current instance to a new with the lower 128-bits set to the value of the current instance and the upper 128-bits left uninitialized. /// A new with the lower 128-bits set to the value of the current instance and the upper 128-bits left uninitialized. /// The type of the current instance () is not supported. + [Intrinsic] public unsafe Vector256 ToVector256Unsafe() { ThrowIfUnsupportedType(); diff --git a/src/System.Private.CoreLib/shared/System/Runtime/Intrinsics/Vector256.cs b/src/System.Private.CoreLib/shared/System/Runtime/Intrinsics/Vector256.cs index 6863a869c4ff..d22e1c092c13 100644 --- a/src/System.Private.CoreLib/shared/System/Runtime/Intrinsics/Vector256.cs +++ b/src/System.Private.CoreLib/shared/System/Runtime/Intrinsics/Vector256.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using System.Runtime.CompilerServices; using Internal.Runtime.CompilerServices; namespace System.Runtime.Intrinsics @@ -905,6 +906,7 @@ public static unsafe Vector256 CreateScalar(ulong value) /// Creates a new instance with the first element initialized to the specified value and the remaining elements left uninitialized. /// The value that element 0 will be initialized to. /// A new instance with the first element initialized to and the remaining elements left uninitialized. + [Intrinsic] public static unsafe Vector256 CreateScalarUnsafe(byte value) { // This relies on us stripping the "init" flag from the ".locals" @@ -918,6 +920,7 @@ public static unsafe Vector256 CreateScalarUnsafe(byte value) /// Creates a new instance with the first element initialized to the specified value and the remaining elements left uninitialized. /// The value that element 0 will be initialized to. /// A new instance with the first element initialized to and the remaining elements left uninitialized. + [Intrinsic] public static unsafe Vector256 CreateScalarUnsafe(double value) { // This relies on us stripping the "init" flag from the ".locals" @@ -931,6 +934,7 @@ public static unsafe Vector256 CreateScalarUnsafe(double value) /// Creates a new instance with the first element initialized to the specified value and the remaining elements left uninitialized. /// The value that element 0 will be initialized to. /// A new instance with the first element initialized to and the remaining elements left uninitialized. + [Intrinsic] public static unsafe Vector256 CreateScalarUnsafe(short value) { // This relies on us stripping the "init" flag from the ".locals" @@ -944,6 +948,7 @@ public static unsafe Vector256 CreateScalarUnsafe(short value) /// Creates a new instance with the first element initialized to the specified value and the remaining elements left uninitialized. /// The value that element 0 will be initialized to. /// A new instance with the first element initialized to and the remaining elements left uninitialized. + [Intrinsic] public static unsafe Vector256 CreateScalarUnsafe(int value) { // This relies on us stripping the "init" flag from the ".locals" @@ -957,6 +962,7 @@ public static unsafe Vector256 CreateScalarUnsafe(int value) /// Creates a new instance with the first element initialized to the specified value and the remaining elements left uninitialized. /// The value that element 0 will be initialized to. /// A new instance with the first element initialized to and the remaining elements left uninitialized. + [Intrinsic] public static unsafe Vector256 CreateScalarUnsafe(long value) { // This relies on us stripping the "init" flag from the ".locals" @@ -970,6 +976,7 @@ public static unsafe Vector256 CreateScalarUnsafe(long value) /// Creates a new instance with the first element initialized to the specified value and the remaining elements left uninitialized. /// The value that element 0 will be initialized to. /// A new instance with the first element initialized to and the remaining elements left uninitialized. + [Intrinsic] [CLSCompliant(false)] public static unsafe Vector256 CreateScalarUnsafe(sbyte value) { @@ -984,6 +991,7 @@ public static unsafe Vector256 CreateScalarUnsafe(sbyte value) /// Creates a new instance with the first element initialized to the specified value and the remaining elements left uninitialized. /// The value that element 0 will be initialized to. /// A new instance with the first element initialized to and the remaining elements left uninitialized. + [Intrinsic] public static unsafe Vector256 CreateScalarUnsafe(float value) { // This relies on us stripping the "init" flag from the ".locals" @@ -997,6 +1005,7 @@ public static unsafe Vector256 CreateScalarUnsafe(float value) /// Creates a new instance with the first element initialized to the specified value and the remaining elements left uninitialized. /// The value that element 0 will be initialized to. /// A new instance with the first element initialized to and the remaining elements left uninitialized. + [Intrinsic] [CLSCompliant(false)] public static unsafe Vector256 CreateScalarUnsafe(ushort value) { @@ -1011,6 +1020,7 @@ public static unsafe Vector256 CreateScalarUnsafe(ushort value) /// Creates a new instance with the first element initialized to the specified value and the remaining elements left uninitialized. /// The value that element 0 will be initialized to. /// A new instance with the first element initialized to and the remaining elements left uninitialized. + [Intrinsic] [CLSCompliant(false)] public static unsafe Vector256 CreateScalarUnsafe(uint value) { @@ -1025,6 +1035,7 @@ public static unsafe Vector256 CreateScalarUnsafe(uint value) /// Creates a new instance with the first element initialized to the specified value and the remaining elements left uninitialized. /// The value that element 0 will be initialized to. /// A new instance with the first element initialized to and the remaining elements left uninitialized. + [Intrinsic] [CLSCompliant(false)] public static unsafe Vector256 CreateScalarUnsafe(ulong value) { diff --git a/src/System.Private.CoreLib/shared/System/Runtime/Intrinsics/Vector256_1.cs b/src/System.Private.CoreLib/shared/System/Runtime/Intrinsics/Vector256_1.cs index fe0fc219c277..dd18a4cde9db 100644 --- a/src/System.Private.CoreLib/shared/System/Runtime/Intrinsics/Vector256_1.cs +++ b/src/System.Private.CoreLib/shared/System/Runtime/Intrinsics/Vector256_1.cs @@ -254,6 +254,7 @@ public override int GetHashCode() /// Gets the value of the lower 128-bits as a new . /// The value of the lower 128-bits as a new . /// The type of the current instance () is not supported. + [Intrinsic] public Vector128 GetLower() { ThrowIfUnsupportedType(); @@ -305,6 +306,7 @@ public Vector256 WithUpper(Vector128 value) /// Converts the current instance to a scalar containing the value of the first element. /// A scalar containing the value of the first element. /// The type of the current instance () is not supported. + [Intrinsic] public T ToScalar() { ThrowIfUnsupportedType(); diff --git a/src/jit/compiler.h b/src/jit/compiler.h index df5578f16635..9c9cf8229efc 100644 --- a/src/jit/compiler.h +++ b/src/jit/compiler.h @@ -3411,7 +3411,10 @@ class Compiler NamedIntrinsic lookupNamedIntrinsic(CORINFO_METHOD_HANDLE method); #ifdef FEATURE_HW_INTRINSICS - GenTree* impBaseIntrinsic(NamedIntrinsic intrinsic, CORINFO_METHOD_HANDLE method, CORINFO_SIG_INFO* sig); + GenTree* impBaseIntrinsic(NamedIntrinsic intrinsic, + CORINFO_CLASS_HANDLE clsHnd, + CORINFO_METHOD_HANDLE method, + CORINFO_SIG_INFO* sig); GenTree* impHWIntrinsic(NamedIntrinsic intrinsic, CORINFO_METHOD_HANDLE method, CORINFO_SIG_INFO* sig, diff --git a/src/jit/hwintrinsiccodegenxarch.cpp b/src/jit/hwintrinsiccodegenxarch.cpp index 320c9fbba7fe..8f2bc3abcdc5 100644 --- a/src/jit/hwintrinsiccodegenxarch.cpp +++ b/src/jit/hwintrinsiccodegenxarch.cpp @@ -1254,29 +1254,111 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node) var_types targetType = node->TypeGet(); var_types baseType = node->gtSIMDBaseType; - assert(node->gtGetOp1() == nullptr); + assert(compiler->compSupports(InstructionSet_SSE)); + assert((baseType >= TYP_BYTE) && (baseType <= TYP_DOUBLE)); + + GenTree* op1 = node->gtGetOp1(); + regNumber op1Reg = REG_NA; + + if (op1 != nullptr) + { + assert(!op1->OperIsList()); + op1Reg = op1->gtRegNum; + genConsumeOperands(node); + } + assert(node->gtGetOp2() == nullptr); - assert(baseType >= TYP_BYTE && baseType <= TYP_DOUBLE); - emitter* emit = getEmitter(); - emitAttr attr = EA_ATTR(node->gtSIMDSize); + emitter* emit = getEmitter(); + emitAttr attr = EA_ATTR(node->gtSIMDSize); + instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType); switch (intrinsicId) { - case NI_Base_Vector128_Zero: + case NI_Base_Vector128_CreateScalarUnsafe: + case NI_Base_Vector256_CreateScalarUnsafe: { - // When SSE2 is supported, we generate pxor for integral types otherwise just use xorps - instruction ins = - (compiler->compSupports(InstructionSet_SSE2) && varTypeIsIntegral(baseType)) ? INS_pxor : INS_xorps; - emit->emitIns_SIMD_R_R_R(ins, attr, targetReg, targetReg, targetReg); + if (varTypeIsIntegral(baseType)) + { + genHWIntrinsic_R_RM(node, ins, emitActualTypeSize(baseType)); + } + else + { + assert(varTypeIsFloating(baseType)); + + attr = emitTypeSize(baseType); + + if (op1->isContained() || op1->isUsedFromSpillTemp()) + { + genHWIntrinsic_R_RM(node, ins, attr); + } + else if (targetReg != op1Reg) + { + // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs + emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); + } + } break; } + case NI_Base_Vector128_ToScalar: + case NI_Base_Vector256_ToScalar: + { + assert(varTypeIsFloating(baseType)); + + attr = emitTypeSize(TYP_SIMD16); + + if (op1->isContained() || op1->isUsedFromSpillTemp()) + { + genHWIntrinsic_R_RM(node, ins, attr); + } + else if (targetReg != op1Reg) + { + // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs + emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); + } + break; + } + + case NI_Base_Vector128_ToVector256: + { + // ToVector256 has zero-extend semantics in order to ensure it is deterministic + // We always emit a move to the target register, even when op1Reg == targetReg, + // in order to ensure that Bits MAXVL-1:128 are zeroed. + + attr = emitTypeSize(TYP_SIMD16); + + if (op1->isContained() || op1->isUsedFromSpillTemp()) + { + genHWIntrinsic_R_RM(node, ins, attr); + } + else + { + // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs + emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); + } + break; + } + + case NI_Base_Vector128_ToVector256Unsafe: + case NI_Base_Vector256_GetLower: + { + if (op1->isContained() || op1->isUsedFromSpillTemp()) + { + genHWIntrinsic_R_RM(node, ins, attr); + } + else if (targetReg != op1Reg) + { + // Just use movaps for reg->reg moves as it has zero-latency on modern CPUs + emit->emitIns_R_R(INS_movaps, attr, targetReg, op1Reg); + } + break; + } + + case NI_Base_Vector128_Zero: case NI_Base_Vector256_Zero: { - // When AVX2 is supported, we generate pxor for integral types otherwise just use xorps - instruction ins = - (compiler->compSupports(InstructionSet_AVX2) && varTypeIsIntegral(baseType)) ? INS_pxor : INS_xorps; + assert(op1 == nullptr); emit->emitIns_SIMD_R_R_R(ins, attr, targetReg, targetReg, targetReg); break; } diff --git a/src/jit/hwintrinsiclistxarch.h b/src/jit/hwintrinsiclistxarch.h index 75de24b8aa12..68cfda59f347 100644 --- a/src/jit/hwintrinsiclistxarch.h +++ b/src/jit/hwintrinsiclistxarch.h @@ -38,7 +38,11 @@ HARDWARE_INTRINSIC(Base_Vector128_AsSingle, "AsSingle", HARDWARE_INTRINSIC(Base_Vector128_AsUInt16, "AsUInt16", Base, -1, 16, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Base_Vector128_AsUInt32, "AsUInt32", Base, -1, 16, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Base_Vector128_AsUInt64, "AsUInt64", Base, -1, 16, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(Base_Vector128_Zero, "get_Zero", Base, -1, 16, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(Base_Vector128_CreateScalarUnsafe, "CreateScalarUnsafe", Base, -1, 16, 1, {INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_movss, INS_movsdsse2}, HW_Category_SIMDScalar, HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(Base_Vector128_ToScalar, "ToScalar", Base, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsdsse2}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(Base_Vector128_ToVector256, "ToVector256", Base, -1, 16, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(Base_Vector128_ToVector256Unsafe, "ToVector256Unsafe", Base, -1, 16, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(Base_Vector128_Zero, "get_Zero", Base, -1, 16, 0, {INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps}, HW_Category_Helper, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Base_Vector256_As, "As`1", Base, -1, 32, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Base_Vector256_AsByte, "AsByte", Base, -1, 32, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Base_Vector256_AsDouble, "AsDouble", Base, -1, 32, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) @@ -50,7 +54,10 @@ HARDWARE_INTRINSIC(Base_Vector256_AsSingle, "AsSingle", HARDWARE_INTRINSIC(Base_Vector256_AsUInt16, "AsUInt16", Base, -1, 32, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Base_Vector256_AsUInt32, "AsUInt32", Base, -1, 32, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Base_Vector256_AsUInt64, "AsUInt64", Base, -1, 32, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(Base_Vector256_Zero, "get_Zero", Base, -1, 32, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(Base_Vector256_CreateScalarUnsafe, "CreateScalarUnsafe", Base, -1, 32, 1, {INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_mov_i2xmm, INS_movss, INS_movsdsse2}, HW_Category_SIMDScalar, HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(Base_Vector256_GetLower, "GetLower", Base, -1, 32, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(Base_Vector256_ToScalar, "ToScalar", Base, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsdsse2}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(Base_Vector256_Zero, "get_Zero", Base, -1, 32, 0, {INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps}, HW_Category_Helper, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // Intrinsic ID Function name ISA ival SIMD size NumArg instructions Category Flags diff --git a/src/jit/importer.cpp b/src/jit/importer.cpp index 0ddf3b34ac80..96badba0f85f 100644 --- a/src/jit/importer.cpp +++ b/src/jit/importer.cpp @@ -3448,6 +3448,10 @@ GenTree* Compiler::impIntrinsic(GenTree* newobjThis, case NI_Base_Vector128_AsUInt32: case NI_Base_Vector128_AsUInt64: #if defined(_TARGET_XARCH_) + case NI_Base_Vector128_CreateScalarUnsafe: + case NI_Base_Vector128_ToScalar: + case NI_Base_Vector128_ToVector256: + case NI_Base_Vector128_ToVector256Unsafe: case NI_Base_Vector128_Zero: case NI_Base_Vector256_As: case NI_Base_Vector256_AsByte: @@ -3460,10 +3464,13 @@ GenTree* Compiler::impIntrinsic(GenTree* newobjThis, case NI_Base_Vector256_AsUInt16: case NI_Base_Vector256_AsUInt32: case NI_Base_Vector256_AsUInt64: + case NI_Base_Vector256_CreateScalarUnsafe: + case NI_Base_Vector256_GetLower: + case NI_Base_Vector256_ToScalar: case NI_Base_Vector256_Zero: #endif // _TARGET_XARCH_ { - return impBaseIntrinsic(ni, method, sig); + return impBaseIntrinsic(ni, clsHnd, method, sig); } default: @@ -4101,15 +4108,20 @@ GenTree* Compiler::impIntrinsic(GenTree* newobjThis, // // Arguments: // intrinsic -- id of the intrinsic function. +// clsHnd -- handle for the intrinsic method's class // method -- method handle of the intrinsic function. // sig -- signature of the intrinsic call // // Return Value: // the expanded intrinsic. // -GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, CORINFO_METHOD_HANDLE method, CORINFO_SIG_INFO* sig) +GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, + CORINFO_CLASS_HANDLE clsHnd, + CORINFO_METHOD_HANDLE method, + CORINFO_SIG_INFO* sig) { GenTree* retNode = nullptr; + GenTree* op1 = nullptr; if (!featureSIMD) { @@ -4117,19 +4129,26 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, CORINFO_METHOD_HAN } unsigned simdSize = 0; - var_types baseType = getBaseTypeAndSizeOfSIMDType(sig->retTypeClass, &simdSize); - var_types retType = getSIMDTypeForSize(simdSize); + var_types baseType = TYP_UNKNOWN; + var_types retType = JITtype2varType(sig->retType); if (sig->hasThis()) { - CORINFO_CLASS_HANDLE thisClass = info.compCompHnd->getArgClass(sig, sig->args); - var_types thisType = getBaseTypeOfSIMDType(thisClass); + baseType = getBaseTypeAndSizeOfSIMDType(clsHnd, &simdSize); - if (!varTypeIsArithmetic(thisType)) + if (retType == TYP_STRUCT) { - return nullptr; + unsigned retSimdSize = 0; + getBaseTypeAndSizeOfSIMDType(sig->retTypeClass, &retSimdSize); + retType = getSIMDTypeForSize(retSimdSize); } } + else + { + assert(retType == TYP_STRUCT); + baseType = getBaseTypeAndSizeOfSIMDType(sig->retTypeClass, &simdSize); + retType = getSIMDTypeForSize(simdSize); + } if (!varTypeIsArithmetic(baseType)) { @@ -4186,6 +4205,56 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, CORINFO_METHOD_HAN } #ifdef _TARGET_XARCH_ + case NI_Base_Vector128_CreateScalarUnsafe: + { + assert(sig->numArgs == 1); + +#ifdef _TARGET_X86_ + if (varTypeIsLong(baseType)) + { + // TODO-XARCH-CQ: It may be beneficial to emit the movq + // instruction, which takes a 64-bit memory address and + // works on 32-bit x86 systems. + break; + } +#endif // _TARGET_X86_ + + if (compSupports(InstructionSet_SSE2) || (compSupports(InstructionSet_SSE) && (baseType == TYP_FLOAT))) + { + op1 = impPopStack().val; + retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize); + } + break; + } + + case NI_Base_Vector128_ToScalar: + { + assert(sig->numArgs == 0); + assert(sig->hasThis()); + + if (compSupports(InstructionSet_SSE) && varTypeIsFloating(baseType)) + { + op1 = impSIMDPopStack(getSIMDTypeForSize(simdSize), true, clsHnd); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, 16); + } + break; + } + + case NI_Base_Vector128_ToVector256: + case NI_Base_Vector128_ToVector256Unsafe: + case NI_Base_Vector256_GetLower: + { + assert(sig->numArgs == 0); + assert(sig->hasThis()); + + if (compSupports(InstructionSet_AVX)) + { + op1 = impSIMDPopStack(getSIMDTypeForSize(simdSize), true, clsHnd); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize); + } + break; + } + case NI_Base_Vector128_Zero: { assert(sig->numArgs == 0); @@ -4197,6 +4266,41 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, CORINFO_METHOD_HAN break; } + case NI_Base_Vector256_CreateScalarUnsafe: + { + assert(sig->numArgs == 1); + +#ifdef _TARGET_X86_ + if (varTypeIsLong(baseType)) + { + // TODO-XARCH-CQ: It may be beneficial to emit the movq + // instruction, which takes a 64-bit memory address and + // works on 32-bit x86 systems. + break; + } +#endif // _TARGET_X86_ + + if (compSupports(InstructionSet_AVX)) + { + op1 = impPopStack().val; + retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize); + } + break; + } + + case NI_Base_Vector256_ToScalar: + { + assert(sig->numArgs == 0); + assert(sig->hasThis()); + + if (compSupports(InstructionSet_AVX) && varTypeIsFloating(baseType)) + { + op1 = impSIMDPopStack(getSIMDTypeForSize(simdSize), true, clsHnd); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, 32); + } + break; + } + case NI_Base_Vector256_Zero: { assert(sig->numArgs == 0); @@ -4419,7 +4523,17 @@ NamedIntrinsic Compiler::lookupNamedIntrinsic(CORINFO_METHOD_HANDLE method) { className += 3; - if (strcmp(className, "`1") == 0) +#if defined(_TARGET_XARCH_) + if (className[0] == '\0') + { + if (strcmp(methodName, "CreateScalarUnsafe") == 0) + { + result = NI_Base_Vector128_CreateScalarUnsafe; + } + } + else +#endif // _TARGET_XARCH_ + if (strcmp(className, "`1") == 0) { if (strncmp(methodName, "As", 2) == 0) { @@ -4475,6 +4589,28 @@ NamedIntrinsic Compiler::lookupNamedIntrinsic(CORINFO_METHOD_HANDLE method) { result = NI_Base_Vector128_Zero; } + else if (strncmp(methodName, "To", 2) == 0) + { + methodName += 2; + + if (strcmp(methodName, "Scalar") == 0) + { + result = NI_Base_Vector128_ToScalar; + } + else if (strncmp(methodName, "Vector256", 9) == 0) + { + methodName += 9; + + if (methodName[0] == '\0') + { + result = NI_Base_Vector128_ToVector256; + } + else if (strcmp(methodName, "Unsafe") == 0) + { + result = NI_Base_Vector128_ToVector256Unsafe; + } + } + } #endif // _TARGET_XARCH_ } } @@ -4483,7 +4619,14 @@ NamedIntrinsic Compiler::lookupNamedIntrinsic(CORINFO_METHOD_HANDLE method) { className += 3; - if (strcmp(className, "`1") == 0) + if (className[0] == '\0') + { + if (strcmp(methodName, "CreateScalarUnsafe") == 0) + { + result = NI_Base_Vector256_CreateScalarUnsafe; + } + } + else if (strcmp(className, "`1") == 0) { if (strncmp(methodName, "As", 2) == 0) { @@ -4538,6 +4681,14 @@ NamedIntrinsic Compiler::lookupNamedIntrinsic(CORINFO_METHOD_HANDLE method) { result = NI_Base_Vector256_Zero; } + else if (strcmp(methodName, "GetLower") == 0) + { + result = NI_Base_Vector256_GetLower; + } + else if (strcmp(methodName, "ToScalar") == 0) + { + result = NI_Base_Vector256_ToScalar; + } } } #endif // _TARGET_XARCH_ diff --git a/src/jit/lsraxarch.cpp b/src/jit/lsraxarch.cpp index c5cc71e22268..774334c032b2 100644 --- a/src/jit/lsraxarch.cpp +++ b/src/jit/lsraxarch.cpp @@ -2372,6 +2372,60 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree) // must be handled within the case. switch (intrinsicId) { + case NI_Base_Vector128_CreateScalarUnsafe: + case NI_Base_Vector128_ToScalar: + case NI_Base_Vector256_CreateScalarUnsafe: + case NI_Base_Vector256_ToScalar: + { + assert(numArgs == 1); + + if (varTypeIsFloating(baseType)) + { + if (op1->isContained()) + { + srcCount += BuildOperandUses(op1); + } + else + { + // We will either be in memory and need to be moved + // into a register of the appropriate size or we + // are already in an XMM/YMM register and can stay + // where we are. + + tgtPrefUse = BuildUse(op1); + srcCount += 1; + } + + buildUses = false; + } + break; + } + + case NI_Base_Vector128_ToVector256: + case NI_Base_Vector128_ToVector256Unsafe: + case NI_Base_Vector256_GetLower: + { + assert(numArgs == 1); + + if (op1->isContained()) + { + srcCount += BuildOperandUses(op1); + } + else + { + // We will either be in memory and need to be moved + // into a register of the appropriate size or we + // are already in an XMM/YMM register and can stay + // where we are. + + tgtPrefUse = BuildUse(op1); + srcCount += 1; + } + + buildUses = false; + break; + } + case NI_SSE_CompareEqualOrderedScalar: case NI_SSE_CompareEqualUnorderedScalar: case NI_SSE_CompareNotEqualOrderedScalar: