From dc206c3fff662de83fbfd297be9fb58cb0be4fcf Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Mon, 4 Oct 2021 15:51:22 -0700 Subject: [PATCH 1/5] Moving Narrow to implemented using SIMDAsHWIntrinsic --- src/coreclr/jit/codegen.h | 1 - src/coreclr/jit/codegenarm64.cpp | 86 ---- src/coreclr/jit/compiler.h | 7 + src/coreclr/jit/gentree.cpp | 409 +++++++++++++++++++ src/coreclr/jit/hwintrinsicarm64.cpp | 6 +- src/coreclr/jit/hwintrinsicxarch.cpp | 10 +- src/coreclr/jit/lsraarm64.cpp | 11 - src/coreclr/jit/lsraxarch.cpp | 10 - src/coreclr/jit/simd.cpp | 12 - src/coreclr/jit/simdashwintrinsic.cpp | 16 +- src/coreclr/jit/simdashwintrinsic.h | 9 + src/coreclr/jit/simdashwintrinsiclistarm64.h | 1 + src/coreclr/jit/simdashwintrinsiclistxarch.h | 2 + src/coreclr/jit/simdcodegenxarch.cpp | 178 -------- src/coreclr/jit/simdintrinsiclist.h | 2 - 15 files changed, 457 insertions(+), 303 deletions(-) diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index c6b789f86f3cb..3c4c1da158196 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -967,7 +967,6 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX regNumber targetReg); void genSIMDIntrinsic32BitConvert(GenTreeSIMD* simdNode); void genSIMDIntrinsic64BitConvert(GenTreeSIMD* simdNode); - void genSIMDIntrinsicNarrow(GenTreeSIMD* simdNode); void genSIMDExtractUpperHalf(GenTreeSIMD* simdNode, regNumber srcReg, regNumber tgtReg); void genSIMDIntrinsicWiden(GenTreeSIMD* simdNode); void genSIMDIntrinsic(GenTreeSIMD* simdNode); diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index d68da1a1e2a9b..f4fb89fe35000 100644 --- a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -3899,10 +3899,6 @@ void CodeGen::genSIMDIntrinsic(GenTreeSIMD* simdNode) genSIMDIntrinsicWiden(simdNode); break; - case SIMDIntrinsicNarrow: - genSIMDIntrinsicNarrow(simdNode); - break; - case SIMDIntrinsicSub: case SIMDIntrinsicBitwiseAnd: case SIMDIntrinsicBitwiseOr: @@ -3991,11 +3987,6 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type case SIMDIntrinsicEqual: result = INS_fcmeq; break; - case SIMDIntrinsicNarrow: - // Use INS_fcvtn lower bytes of result followed by INS_fcvtn2 for upper bytes - // Return lower bytes instruction here - result = INS_fcvtn; - break; case SIMDIntrinsicSub: result = INS_fsub; break; @@ -4032,11 +4023,6 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type case SIMDIntrinsicEqual: result = INS_cmeq; break; - case SIMDIntrinsicNarrow: - // Use INS_xtn lower bytes of result followed by INS_xtn2 for upper bytes - // Return lower bytes instruction here - result = INS_xtn; - break; case SIMDIntrinsicSub: result = INS_sub; break; @@ -4259,78 +4245,6 @@ void CodeGen::genSIMDIntrinsicWiden(GenTreeSIMD* simdNode) genProduceReg(simdNode); } -//-------------------------------------------------------------------------------- -// genSIMDIntrinsicNarrow: Generate code for SIMD Intrinsic Narrow operations -// -// Arguments: -// simdNode - The GT_SIMD node -// -// Notes: -// This intrinsic takes two arguments. The first operand is narrowed to produce the -// lower elements of the results, and the second operand produces the high elements. 
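(For intuition, an illustrative sketch of that contract in managed terms -- the
values and variable names here are invented and are not part of this patch:

    Vector128<int> lo = Vector128.Create(0, 1, 2, 3);
    Vector128<int> hi = Vector128.Create(4, 5, 6, 7);
    // Result elements 0..3 come from 'lo', elements 4..7 from 'hi':
    Vector128<short> r = Vector128.Narrow(lo, hi); // <0, 1, 2, 3, 4, 5, 6, 7>
)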
-// -void CodeGen::genSIMDIntrinsicNarrow(GenTreeSIMD* simdNode) -{ - assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicNarrow); - - GenTree* op1 = simdNode->gtGetOp1(); - GenTree* op2 = simdNode->gtGetOp2(); - var_types baseType = simdNode->GetSimdBaseType(); - regNumber targetReg = simdNode->GetRegNum(); - assert(targetReg != REG_NA); - var_types simdType = simdNode->TypeGet(); - emitAttr emitSize = emitTypeSize(simdType); - - genConsumeOperands(simdNode); - regNumber op1Reg = op1->GetRegNum(); - regNumber op2Reg = op2->GetRegNum(); - - assert(genIsValidFloatReg(op1Reg)); - assert(genIsValidFloatReg(op2Reg)); - assert(genIsValidFloatReg(targetReg)); - assert(op2Reg != targetReg); - assert(simdNode->GetSimdSize() == 16); - - instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType); - assert((ins == INS_fcvtn) || (ins == INS_xtn)); - - instruction ins2 = (ins == INS_fcvtn) ? INS_fcvtn2 : INS_xtn2; - - insOpts opt = INS_OPTS_NONE; - insOpts opt2 = INS_OPTS_NONE; - - // This is not the same as genGetSimdInsOpt() - // Basetype is the soure operand type - // However encoding is based on the destination operand type which is 1/2 the basetype. - switch (baseType) - { - case TYP_ULONG: - case TYP_LONG: - case TYP_DOUBLE: - opt = INS_OPTS_2S; - opt2 = INS_OPTS_4S; - break; - case TYP_UINT: - case TYP_INT: - opt = INS_OPTS_4H; - opt2 = INS_OPTS_8H; - break; - case TYP_USHORT: - case TYP_SHORT: - opt = INS_OPTS_8B; - opt2 = INS_OPTS_16B; - break; - default: - assert(!"Unsupported narrowing element type"); - unreached(); - } - - GetEmitter()->emitIns_R_R(ins, EA_8BYTE, targetReg, op1Reg, opt); - GetEmitter()->emitIns_R_R(ins2, EA_16BYTE, targetReg, op2Reg, opt2); - - genProduceReg(simdNode); -} - //-------------------------------------------------------------------------------- // genSIMDIntrinsicBinOp: Generate code for SIMD Intrinsic binary operations // add, sub, mul, bit-wise And, AndNot and Or. 
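(For reference: the fcvtn/fcvtn2 and xtn/xtn2 pairing that the removed codegen
emitted by hand is preserved by the new expansion in gtNewSimdNarrowNode below,
which builds the equivalent AdvSimd intrinsic pair. In managed terms, assuming
op1 and op2 are Vector128<int> -- a sketch, not code from this patch:

    Vector64<short> lower = AdvSimd.ExtractNarrowingLower(op1);          // xtn
    Vector128<short> result = AdvSimd.ExtractNarrowingUpper(lower, op2); // xtn2
)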
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index cd46b491ea95e..63511d1c723d5 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -3262,6 +3262,13 @@ class Compiler unsigned simdSize, bool isSimdAsHWIntrinsic); + GenTree* gtNewSimdNarrowNode(var_types type, + GenTree* op1, + GenTree* op2, + CorInfoType simdBaseJitType, + unsigned simdSize, + bool isSimdAsHWIntrinsic); + GenTree* gtNewSimdSqrtNode( var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize, bool isSimdAsHWIntrinsic); diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 90093f9e7bba6..49e9c22dd9788 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -20911,6 +20911,415 @@ GenTree* Compiler::gtNewSimdMinNode(var_types type, return gtNewSimdCndSelNode(type, op1, op1Dup, op2Dup, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); } +GenTree* Compiler::gtNewSimdNarrowNode(var_types type, + GenTree* op1, + GenTree* op2, + CorInfoType simdBaseJitType, + unsigned simdSize, + bool isSimdAsHWIntrinsic) +{ + assert(IsBaselineSimdIsaSupportedDebugOnly()); + + assert(varTypeIsSIMD(type)); + assert(getSIMDTypeForSize(simdSize) == type); + + assert(op1 != nullptr); + assert(op1->TypeIs(type)); + + assert(op2 != nullptr); + assert(op2->TypeIs(type)); + + var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType); + assert(varTypeIsArithmetic(simdBaseType)); + + NamedIntrinsic intrinsic = NI_Illegal; + + GenTree* tmp1; + GenTree* tmp2; + +#if defined(TARGET_XARCH) + GenTree* tmp3; + GenTree* tmp4; + + if (simdSize == 32) + { + assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); + + switch (simdBaseType) + { + case TYP_BYTE: + case TYP_UBYTE: + { + assert(compIsaSupportedDebugOnly(InstructionSet_AVX2)); + + // This is the same in principle to the other comments below, however due to + // code formatting, its too long to reasonably display here. + + CorInfoType opBaseJitType = (simdBaseType == TYP_BYTE) ? CORINFO_TYPE_SHORT : CORINFO_TYPE_USHORT; + CORINFO_CLASS_HANDLE clsHnd = gtGetStructHandleForSIMD(type, opBaseJitType); + + tmp1 = gtNewSimdHWIntrinsicNode(type, gtNewIconNode(0x00FF), NI_Vector256_Create, opBaseJitType, + simdSize, isSimdAsHWIntrinsic); + + GenTree* tmp1Dup; + tmp1 = impCloneExpr(tmp1, &tmp1Dup, clsHnd, (unsigned)CHECK_SPILL_ALL, + nullptr DEBUGARG("Clone tmp1 for vector narrow")); + + tmp2 = gtNewSimdHWIntrinsicNode(type, op1, tmp1, NI_SSE2_And, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + tmp3 = gtNewSimdHWIntrinsicNode(type, op2, tmp1Dup, NI_SSE2_And, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + tmp4 = gtNewSimdHWIntrinsicNode(type, tmp2, tmp3, NI_SSE2_PackUnsignedSaturate, CORINFO_TYPE_UBYTE, + simdSize, isSimdAsHWIntrinsic); + + CorInfoType permuteBaseJitType = (simdBaseType == TYP_BYTE) ? 
CORINFO_TYPE_LONG : CORINFO_TYPE_ULONG;
+ return gtNewSimdHWIntrinsicNode(type, tmp4, gtNewIconNode(SHUFFLE_WYZX), NI_AVX2_Permute4x64,
+ permuteBaseJitType, simdSize, isSimdAsHWIntrinsic);
+ }
+
+ case TYP_SHORT:
+ case TYP_USHORT:
+ {
+ assert(compIsaSupportedDebugOnly(InstructionSet_AVX2));
+
+ // op1 = Elements 0L, 0U, 1L, 1U, 2L, 2U, 3L, 3U | 4L, 4U, 5L, 5U, 6L, 6U, 7L, 7U
+ // op2 = Elements 8L, 8U, 9L, 9U, AL, AU, BL, BU | CL, CU, DL, DU, EL, EU, FL, FU
+ //
+ // tmp2 = Elements 0L, --, 1L, --, 2L, --, 3L, -- | 4L, --, 5L, --, 6L, --, 7L, --
+ // tmp3 = Elements 8L, --, 9L, --, AL, --, BL, -- | CL, --, DL, --, EL, --, FL, --
+ // tmp4 = Elements 0L, 1L, 2L, 3L, 8L, 9L, AL, BL | 4L, 5L, 6L, 7L, CL, DL, EL, FL
+ // return Elements 0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L | 8L, 9L, AL, BL, CL, DL, EL, FL
+ //
+ // var tmp1 = Vector256.Create(0x0000FFFF).AsInt16();
+ // var tmp2 = Avx2.And(op1.AsInt16(), tmp1);
+ // var tmp3 = Avx2.And(op2.AsInt16(), tmp1);
+ // var tmp4 = Avx2.PackUnsignedSaturate(tmp2, tmp3);
+ // return Avx2.Permute4x64(tmp4.AsUInt64(), SHUFFLE_WYZX).As();
+
+ CorInfoType opBaseJitType = (simdBaseType == TYP_SHORT) ? CORINFO_TYPE_INT : CORINFO_TYPE_UINT;
+ CORINFO_CLASS_HANDLE clsHnd = gtGetStructHandleForSIMD(type, opBaseJitType);
+
+ tmp1 = gtNewSimdHWIntrinsicNode(type, gtNewIconNode(0x0000FFFF), NI_Vector256_Create, opBaseJitType,
+ simdSize, isSimdAsHWIntrinsic);
+
+ GenTree* tmp1Dup;
+ tmp1 = impCloneExpr(tmp1, &tmp1Dup, clsHnd, (unsigned)CHECK_SPILL_ALL,
+ nullptr DEBUGARG("Clone tmp1 for vector narrow"));
+
+ tmp2 = gtNewSimdHWIntrinsicNode(type, op1, tmp1, NI_SSE2_And, simdBaseJitType, simdSize,
+ isSimdAsHWIntrinsic);
+ tmp3 = gtNewSimdHWIntrinsicNode(type, op2, tmp1Dup, NI_SSE2_And, simdBaseJitType, simdSize,
+ isSimdAsHWIntrinsic);
+ tmp4 = gtNewSimdHWIntrinsicNode(type, tmp2, tmp3, NI_SSE41_PackUnsignedSaturate, CORINFO_TYPE_USHORT,
+ simdSize, isSimdAsHWIntrinsic);
+
+ CorInfoType permuteBaseJitType = (simdBaseType == TYP_SHORT) ? CORINFO_TYPE_LONG : CORINFO_TYPE_ULONG;
+ return gtNewSimdHWIntrinsicNode(type, tmp4, gtNewIconNode(SHUFFLE_WYZX), NI_AVX2_Permute4x64,
+ permuteBaseJitType, simdSize, isSimdAsHWIntrinsic);
+ }
+
+ case TYP_INT:
+ case TYP_UINT:
+ {
+ assert(compIsaSupportedDebugOnly(InstructionSet_AVX2));
+
+ // op1 = Elements 0, 1 | 2, 3; 0L, 0U, 1L, 1U | 2L, 2U, 3L, 3U
+ // op2 = Elements 4, 5 | 6, 7; 4L, 4U, 5L, 5U | 6L, 6U, 7L, 7U
+ //
+ // tmp1 = Elements 0L, 4L, 0U, 4U | 2L, 6L, 2U, 6U
+ // tmp2 = Elements 1L, 5L, 1U, 5U | 3L, 7L, 3U, 7U
+ // tmp3 = Elements 0L, 1L, 4L, 5L | 2L, 3L, 6L, 7L
+ // return Elements 0L, 1L, 2L, 3L | 4L, 5L, 6L, 7L
+ //
+ // var tmp1 = Avx2.UnpackLow(op1, op2);
+ // var tmp2 = Avx2.UnpackHigh(op1, op2);
+ // var tmp3 = Avx2.UnpackLow(tmp1, tmp2);
+ // return Avx2.Permute4x64(tmp3.AsUInt64(), SHUFFLE_WYZX).AsUInt32();
+
+ CorInfoType opBaseJitType = (simdBaseType == TYP_INT) ?
CORINFO_TYPE_LONG : CORINFO_TYPE_ULONG; + CORINFO_CLASS_HANDLE clsHnd = gtGetStructHandleForSIMD(type, opBaseJitType); + + GenTree* op1Dup; + op1 = impCloneExpr(op1, &op1Dup, clsHnd, (unsigned)CHECK_SPILL_ALL, + nullptr DEBUGARG("Clone op1 for vector narrow")); + + GenTree* op2Dup; + op2 = impCloneExpr(op2, &op2Dup, clsHnd, (unsigned)CHECK_SPILL_ALL, + nullptr DEBUGARG("Clone op2 for vector narrow")); + + tmp1 = gtNewSimdHWIntrinsicNode(type, op1, op2, NI_AVX2_UnpackLow, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + tmp2 = gtNewSimdHWIntrinsicNode(type, op1Dup, op2Dup, NI_AVX2_UnpackHigh, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + tmp3 = gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_AVX2_UnpackLow, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + + return gtNewSimdHWIntrinsicNode(type, tmp3, gtNewIconNode(SHUFFLE_WYZX), NI_AVX2_Permute4x64, + opBaseJitType, simdSize, isSimdAsHWIntrinsic); + } + + case TYP_FLOAT: + { + // op1 = Elements 0, 1 | 2, 3 + // op2 = Elements 4, 5 | 6, 7 + // + // tmp1 = Elements 0, 1, 2, 3 | -, -, -, - + // tmp1 = Elements 4, 5, 6, 7 + // return Elements 0, 1, 2, 3 | 4, 5, 6, 7 + // + // var tmp1 = Avx.ConvertToVector128Single(op1).ToVector256Unsafe(); + // var tmp2 = Avx.ConvertToVector128Single(op2); + // return Avx.InsertVector128(tmp1, tmp2, 1); + + tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_AVX_ConvertToVector128Single, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + tmp2 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, NI_AVX_ConvertToVector128Single, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + + tmp1 = gtNewSimdHWIntrinsicNode(type, tmp1, NI_Vector128_ToVector256Unsafe, simdBaseJitType, 16, + isSimdAsHWIntrinsic); + return gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, gtNewIconNode(1), NI_AVX_InsertVector128, + simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + } + + default: + { + unreached(); + } + } + } + else + { + switch (simdBaseType) + { + case TYP_BYTE: + case TYP_UBYTE: + { + // op1 = Elements 0, 1, 2, 3, 4, 5, 6, 7; 0L, 0U, 1L, 1U, 2L, 2U, 3L, 3U, 4L, 4U, 5L, 5U, 6L, 6U, 7L, 7U + // op2 = Elements 8, 9, A, B, C, D, E, F; 8L, 8U, 9L, 9U, AL, AU, BL, BU, CL, CU, DL, DU, EL, EU, FL, FU + // + // tmp2 = Elements 0L, --, 1L, --, 2L, --, 3L, --, 4L, --, 5L, --, 6L, --, 7L, -- + // tmp3 = Elements 8L, --, 9L, --, AL, --, BL, --, CL, --, DL, --, EL, --, FL, -- + // return Elements 0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, AL, BL, CL, DL, EL, FL + // + // var tmp1 = Vector128.Create((ushort)(0x00FF)).AsSByte(); + // var tmp2 = Sse2.And(op1.AsSByte(), tmp1); + // var tmp3 = Sse2.And(op2.AsSByte(), tmp1); + // return Sse2.PackUnsignedSaturate(tmp1, tmp2).As(); + + CorInfoType opBaseJitType = (simdBaseType == TYP_BYTE) ? 
CORINFO_TYPE_SHORT : CORINFO_TYPE_USHORT; + CORINFO_CLASS_HANDLE clsHnd = gtGetStructHandleForSIMD(type, opBaseJitType); + + tmp1 = gtNewSimdHWIntrinsicNode(type, gtNewIconNode(0x00FF), NI_Vector128_Create, opBaseJitType, + simdSize, isSimdAsHWIntrinsic); + + GenTree* tmp1Dup; + tmp1 = impCloneExpr(tmp1, &tmp1Dup, clsHnd, (unsigned)CHECK_SPILL_ALL, + nullptr DEBUGARG("Clone tmp1 for vector narrow")); + + tmp2 = gtNewSimdHWIntrinsicNode(type, op1, tmp1, NI_SSE2_And, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + tmp3 = gtNewSimdHWIntrinsicNode(type, op2, tmp1Dup, NI_SSE2_And, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + + return gtNewSimdHWIntrinsicNode(type, tmp2, tmp3, NI_SSE2_PackUnsignedSaturate, CORINFO_TYPE_UBYTE, + simdSize, isSimdAsHWIntrinsic); + } + + case TYP_SHORT: + case TYP_USHORT: + { + // op1 = Elements 0, 1, 2, 3; 0L, 0U, 1L, 1U, 2L, 2U, 3L, 3U + // op2 = Elements 4, 5, 6, 7; 4L, 4U, 5L, 5U, 6L, 6U, 7L, 7U + // + // ... + + CorInfoType opBaseJitType = (simdBaseType == TYP_SHORT) ? CORINFO_TYPE_INT : CORINFO_TYPE_UINT; + CORINFO_CLASS_HANDLE clsHnd = gtGetStructHandleForSIMD(type, opBaseJitType); + + if (compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + // ... + // + // tmp2 = Elements 0L, --, 1L, --, 2L, --, 3L, -- + // tmp3 = Elements 4L, --, 5L, --, 6L, --, 7L, -- + // return Elements 0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L + // + // var tmp1 = Vector128.Create(0x0000FFFF).AsInt16(); + // var tmp2 = Sse2.And(op1.AsInt16(), tmp1); + // var tmp3 = Sse2.And(op2.AsInt16(), tmp1); + // return Sse2.PackUnsignedSaturate(tmp2, tmp3).As(); + + tmp1 = gtNewSimdHWIntrinsicNode(type, gtNewIconNode(0x0000FFFF), NI_Vector128_Create, opBaseJitType, + simdSize, isSimdAsHWIntrinsic); + + GenTree* tmp1Dup; + tmp1 = impCloneExpr(tmp1, &tmp1Dup, clsHnd, (unsigned)CHECK_SPILL_ALL, + nullptr DEBUGARG("Clone tmp1 for vector narrow")); + + tmp2 = gtNewSimdHWIntrinsicNode(type, op1, tmp1, NI_SSE2_And, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + tmp3 = gtNewSimdHWIntrinsicNode(type, op2, tmp1Dup, NI_SSE2_And, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + + return gtNewSimdHWIntrinsicNode(type, tmp2, tmp3, NI_SSE41_PackUnsignedSaturate, + CORINFO_TYPE_USHORT, simdSize, isSimdAsHWIntrinsic); + } + else + { + // ... 
+ // + // tmp1 = Elements 0L, 4L, 0U, 4U, 1L, 5L, 1U, 5U + // tmp2 = Elements 2L, 6L, 2U, 6U, 3L, 7L, 3U, 7U + // tmp3 = Elements 0L, 2L, 4L, 6L, 0U, 2U, 4U, 6U + // tmp4 = Elements 1L, 3L, 5L, 7L, 1U, 3U, 5U, 7U + // return Elements 0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L + // + // var tmp1 = Sse2.UnpackLow(op1.AsUInt16(), op2.AsUInt16()); + // var tmp2 = Sse2.UnpackHigh(op1.AsUInt16(), op2.AsUInt16()); + // var tmp3 = Sse2.UnpackLow(tmp1, tmp2); + // var tmp4 = Sse2.UnpackHigh(tmp1, tmp2); + // return Sse2.UnpackLow(tmp3, tmp4).As(); + + GenTree* op1Dup; + op1 = impCloneExpr(op1, &op1Dup, clsHnd, (unsigned)CHECK_SPILL_ALL, + nullptr DEBUGARG("Clone op1 for vector narrow")); + + GenTree* op2Dup; + op2 = impCloneExpr(op2, &op2Dup, clsHnd, (unsigned)CHECK_SPILL_ALL, + nullptr DEBUGARG("Clone op1 for vector narrow")); + + tmp1 = gtNewSimdHWIntrinsicNode(type, op1, op2, NI_SSE2_UnpackLow, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + tmp2 = gtNewSimdHWIntrinsicNode(type, op1Dup, op2Dup, NI_SSE2_UnpackHigh, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + + clsHnd = gtGetStructHandleForSIMD(type, simdBaseJitType); + + GenTree* tmp1Dup; + tmp1 = impCloneExpr(tmp1, &tmp1Dup, clsHnd, (unsigned)CHECK_SPILL_ALL, + nullptr DEBUGARG("Clone tmp1 for vector narrow")); + + GenTree* tmp2Dup; + tmp2 = impCloneExpr(tmp2, &tmp2Dup, clsHnd, (unsigned)CHECK_SPILL_ALL, + nullptr DEBUGARG("Clone tmp2 for vector narrow")); + + tmp3 = gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_SSE2_UnpackLow, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + tmp4 = gtNewSimdHWIntrinsicNode(type, tmp1Dup, tmp2Dup, NI_SSE2_UnpackHigh, simdBaseJitType, + simdSize, isSimdAsHWIntrinsic); + + return gtNewSimdHWIntrinsicNode(type, tmp3, tmp4, NI_SSE2_UnpackLow, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + } + } + + case TYP_INT: + case TYP_UINT: + { + // op1 = Elements 0, 1; 0L, 0U, 1L, 1U + // op2 = Elements 2, 3; 2L, 2U, 3L, 3U + // + // tmp1 = Elements 0L, 2L, 0U, 2U + // tmp2 = Elements 1L, 3L, 1U, 3U + // return Elements 0L, 1L, 2L, 3L + // + // var tmp1 = Sse2.UnpackLow(op1.AsUInt32(), op2.AsUInt32()); + // var tmp2 = Sse2.UnpackHigh(op1.AsUInt32(), op2.AsUInt32()); + // return Sse2.UnpackLow(tmp1, tmp2).As(); + + CorInfoType opBaseJitType = (simdBaseType == TYP_INT) ? 
CORINFO_TYPE_LONG : CORINFO_TYPE_ULONG; + CORINFO_CLASS_HANDLE clsHnd = gtGetStructHandleForSIMD(type, opBaseJitType); + + GenTree* op1Dup; + op1 = impCloneExpr(op1, &op1Dup, clsHnd, (unsigned)CHECK_SPILL_ALL, + nullptr DEBUGARG("Clone op1 for vector narrow")); + + GenTree* op2Dup; + op2 = impCloneExpr(op2, &op2Dup, clsHnd, (unsigned)CHECK_SPILL_ALL, + nullptr DEBUGARG("Clone op2 for vector narrow")); + + tmp1 = gtNewSimdHWIntrinsicNode(type, op1, op2, NI_SSE2_UnpackLow, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + tmp2 = gtNewSimdHWIntrinsicNode(type, op1Dup, op2Dup, NI_SSE2_UnpackHigh, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + + return gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_SSE2_UnpackLow, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + } + + case TYP_FLOAT: + { + // op1 = Elements 0, 1 + // op2 = Elements 2, 3 + // + // tmp1 = Elements 0, 1, -, - + // tmp1 = Elements 2, 3, -, - + // return Elements 0, 1, 2, 3 + // + // var tmp1 = Sse2.ConvertToVector128Single(op1); + // var tmp2 = Sse2.ConvertToVector128Single(op2); + // return Sse.MoveLowToHigh(tmp1, tmp2); + + CorInfoType opBaseJitType = CORINFO_TYPE_DOUBLE; + + tmp1 = gtNewSimdHWIntrinsicNode(type, op1, NI_SSE2_ConvertToVector128Single, opBaseJitType, simdSize, + isSimdAsHWIntrinsic); + tmp2 = gtNewSimdHWIntrinsicNode(type, op2, NI_SSE2_ConvertToVector128Single, opBaseJitType, simdSize, + isSimdAsHWIntrinsic); + + return gtNewSimdHWIntrinsicNode(type, tmp1, tmp2, NI_SSE_MoveLowToHigh, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + } + + default: + { + unreached(); + } + } + } +#elif defined(TARGET_ARM64) + if (varTypeIsFloating(simdBaseType)) + { + // var tmp1 = AdvSimd.Arm64.ConvertToSingleLower(op1); + // return AdvSimd.Arm64.ConvertToSingleUpper(tmp1, op2); + + tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_AdvSimd_Arm64_ConvertToSingleLower, simdBaseJitType, 8, + isSimdAsHWIntrinsic); + return gtNewSimdHWIntrinsicNode(type, tmp1, op2, NI_AdvSimd_Arm64_ConvertToSingleUpper, simdBaseJitType, + simdSize, isSimdAsHWIntrinsic); + } + else if (simdSize == 16) + { + // var tmp1 = AdvSimd.ExtractNarrowingLower(op1); + // return AdvSimd.ExtractNarrowingUpper(tmp1, op2); + + tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_AdvSimd_ExtractNarrowingLower, simdBaseJitType, 8, + isSimdAsHWIntrinsic); + return gtNewSimdHWIntrinsicNode(type, tmp1, op2, NI_AdvSimd_ExtractNarrowingUpper, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + } + else + { + // var tmp1 = op1.ToVector128Unsafe(); + // var tmp2 = AdvSimd.InsertScalar(tmp1.AsUInt64(), 1, op2.AsUInt64()); + // return AdvSimd.ExtractNarrowingUpper(tmp2).As(); + + CorInfoType tmp2BaseJitType = varTypeIsSigned(simdBaseType) ? 
CORINFO_TYPE_LONG : CORINFO_TYPE_ULONG; + + tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector64_ToVector128Unsafe, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + tmp2 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp1, gtNewIconNode(1), op2, NI_AdvSimd_InsertScalar, + tmp2BaseJitType, 16, isSimdAsHWIntrinsic); + + return gtNewSimdHWIntrinsicNode(type, tmp2, NI_AdvSimd_ExtractNarrowingLower, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + } +#else +#error Unsupported platform +#endif // !TARGET_XARCH && !TARGET_ARM64 +} + GenTree* Compiler::gtNewSimdSqrtNode( var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize, bool isSimdAsHWIntrinsic) { diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp index 98795c7941f89..e49fc2c162be2 100644 --- a/src/coreclr/jit/hwintrinsicarm64.cpp +++ b/src/coreclr/jit/hwintrinsicarm64.cpp @@ -821,7 +821,11 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_Narrow: { assert(sig->numArgs == 2); - // TODO-ARM64-CQ: These intrinsics should be accelerated. + + op2 = impSIMDPopStack(retType); + op1 = impSIMDPopStack(retType); + + retNode = gtNewSimdNarrowNode(retType, op1, op2, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); break; } diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 46fb8b7c3f86c..0ee97e80f0b9f 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -1294,7 +1294,15 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, case NI_Vector256_Narrow: { assert(sig->numArgs == 2); - // TODO-XARCH-CQ: These intrinsics should be accelerated + + if ((simdSize != 32) || varTypeIsFloating(simdBaseType) || compExactlyDependsOn(InstructionSet_AVX2)) + { + op2 = impSIMDPopStack(retType); + op1 = impSIMDPopStack(retType); + + retNode = + gtNewSimdNarrowNode(retType, op1, op2, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); + } break; } diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index b7acad960d45e..8a1173ad600d8 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -816,17 +816,6 @@ int LinearScan::BuildSIMD(GenTreeSIMD* simdTree) // No special handling required. break; - case SIMDIntrinsicNarrow: - { - // Op1 will write to dst before Op2 is free - BuildUse(op1); - RefPosition* op2Use = BuildUse(op2); - setDelayFree(op2Use); - srcCount = 2; - buildUses = false; - break; - } - case SIMDIntrinsicInitN: { var_types baseType = simdTree->GetSimdBaseType(); diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index 358630dc9fa30..d3eec424a91ae 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -2018,16 +2018,6 @@ int LinearScan::BuildSIMD(GenTreeSIMD* simdTree) buildInternalIntRegisterDefForNode(simdTree); break; - case SIMDIntrinsicNarrow: - // We need an internal register different from targetReg. - setInternalRegsDelayFree = true; - buildInternalFloatRegisterDefForNode(simdTree); - if ((compiler->getSIMDSupportLevel() == SIMD_AVX2_Supported) && (simdTree->GetSimdBaseType() != TYP_DOUBLE)) - { - buildInternalFloatRegisterDefForNode(simdTree); - } - break; - case SIMDIntrinsicShuffleSSE2: // Second operand is an integer constant and marked as contained. 
assert(simdTree->gtGetOp2()->isContainedIntOrIImmed()); diff --git a/src/coreclr/jit/simd.cpp b/src/coreclr/jit/simd.cpp index a3573b6eeb51e..9ba93aa3dad25 100644 --- a/src/coreclr/jit/simd.cpp +++ b/src/coreclr/jit/simd.cpp @@ -1204,7 +1204,6 @@ const SIMDIntrinsicInfo* Compiler::getSIMDIntrinsicInfo(CORINFO_CLASS_HANDLE* in case SIMDIntrinsicConvertToDouble: case SIMDIntrinsicConvertToInt32: case SIMDIntrinsicConvertToInt64: - case SIMDIntrinsicNarrow: case SIMDIntrinsicWidenHi: case SIMDIntrinsicWidenLo: return true; @@ -2359,17 +2358,6 @@ GenTree* Compiler::impSIMDIntrinsic(OPCODE opcode, } break; - case SIMDIntrinsicNarrow: - { - assert(!instMethod); - op2 = impSIMDPopStack(simdType); - op1 = impSIMDPopStack(simdType); - // op1 and op2 are two input Vector. - simdTree = gtNewSIMDNode(simdType, op1, op2, simdIntrinsicID, simdBaseJitType, size); - retVal = simdTree; - } - break; - case SIMDIntrinsicWiden: { GenTree* dstAddrHi = impSIMDPopStack(TYP_BYREF); diff --git a/src/coreclr/jit/simdashwintrinsic.cpp b/src/coreclr/jit/simdashwintrinsic.cpp index 3aa47a6b50d98..3fd471dbac65e 100644 --- a/src/coreclr/jit/simdashwintrinsic.cpp +++ b/src/coreclr/jit/simdashwintrinsic.cpp @@ -229,7 +229,8 @@ GenTree* Compiler::impSimdAsHWIntrinsic(NamedIntrinsic intrinsic, simdBaseJitType = getBaseJitTypeAndSizeOfSIMDType(clsHnd, &simdSize); } } - else if ((clsHnd == m_simdHandleCache->SIMDVectorHandle) && (numArgs != 0)) + else if ((clsHnd == m_simdHandleCache->SIMDVectorHandle) && (numArgs != 0) && + !SimdAsHWIntrinsicInfo::KeepBaseTypeFromRet(intrinsic)) { // We need to fixup the clsHnd in the case we are an intrinsic on Vector // The first argument will be the appropriate Vector handle to use @@ -915,6 +916,13 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, /* isSimdAsHWIntrinsic */ true); } + case NI_VectorT128_Narrow: + case NI_VectorT256_Narrow: + { + return gtNewSimdNarrowNode(retType, op1, op2, simdBaseJitType, simdSize, + /* isSimdAsHWIntrinsic */ true); + } + case NI_VectorT128_op_Multiply: case NI_VectorT256_op_Multiply: { @@ -954,6 +962,12 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, /* isSimdAsHWIntrinsic */ true); } + case NI_VectorT128_Narrow: + { + return gtNewSimdNarrowNode(retType, op1, op2, simdBaseJitType, simdSize, + /* isSimdAsHWIntrinsic */ true); + } + case NI_VectorT128_op_Multiply: { return gtNewSimdBinOpNode(GT_MUL, retType, op1, op2, simdBaseJitType, simdSize, diff --git a/src/coreclr/jit/simdashwintrinsic.h b/src/coreclr/jit/simdashwintrinsic.h index 176507d0b6653..a48729412b954 100644 --- a/src/coreclr/jit/simdashwintrinsic.h +++ b/src/coreclr/jit/simdashwintrinsic.h @@ -29,6 +29,9 @@ enum class SimdAsHWIntrinsicFlag : unsigned int // Base type should come from the this argument BaseTypeFromThisArg = 0x08, + + // For SIMDVectorHandle, keep the base type from the result type + KeepBaseTypeFromRet = 0x10, }; inline SimdAsHWIntrinsicFlag operator~(SimdAsHWIntrinsicFlag value) @@ -133,6 +136,12 @@ struct SimdAsHWIntrinsicInfo SimdAsHWIntrinsicFlag flags = lookupFlags(id); return (flags & SimdAsHWIntrinsicFlag::BaseTypeFromThisArg) == SimdAsHWIntrinsicFlag::BaseTypeFromThisArg; } + + static bool KeepBaseTypeFromRet(NamedIntrinsic id) + { + SimdAsHWIntrinsicFlag flags = lookupFlags(id); + return (flags & SimdAsHWIntrinsicFlag::KeepBaseTypeFromRet) == SimdAsHWIntrinsicFlag::KeepBaseTypeFromRet; + } }; #endif // _SIMD_AS_HWINTRINSIC_H_ diff --git a/src/coreclr/jit/simdashwintrinsiclistarm64.h 
b/src/coreclr/jit/simdashwintrinsiclistarm64.h index 229222882f720..2c2522e605ed3 100644 --- a/src/coreclr/jit/simdashwintrinsiclistarm64.h +++ b/src/coreclr/jit/simdashwintrinsiclistarm64.h @@ -121,6 +121,7 @@ SIMD_AS_HWINTRINSIC_ID(VectorT128, LessThan, SIMD_AS_HWINTRINSIC_ID(VectorT128, LessThanOrEqual, 2, {NI_AdvSimd_CompareLessThanOrEqual, NI_AdvSimd_CompareLessThanOrEqual, NI_AdvSimd_CompareLessThanOrEqual, NI_AdvSimd_CompareLessThanOrEqual, NI_AdvSimd_CompareLessThanOrEqual, NI_AdvSimd_CompareLessThanOrEqual, NI_AdvSimd_Arm64_CompareLessThanOrEqual, NI_AdvSimd_Arm64_CompareLessThanOrEqual, NI_AdvSimd_CompareLessThanOrEqual, NI_AdvSimd_Arm64_CompareLessThanOrEqual}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, Max, 2, {NI_AdvSimd_Max, NI_AdvSimd_Max, NI_AdvSimd_Max, NI_AdvSimd_Max, NI_AdvSimd_Max, NI_AdvSimd_Max, NI_VectorT128_Max, NI_VectorT128_Max, NI_AdvSimd_Max, NI_AdvSimd_Arm64_Max}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, Min, 2, {NI_AdvSimd_Min, NI_AdvSimd_Min, NI_AdvSimd_Min, NI_AdvSimd_Min, NI_AdvSimd_Min, NI_AdvSimd_Min, NI_VectorT128_Min, NI_VectorT128_Min, NI_AdvSimd_Min, NI_AdvSimd_Arm64_Min}, SimdAsHWIntrinsicFlag::None) +SIMD_AS_HWINTRINSIC_ID(VectorT128, Narrow, 2, {NI_VectorT128_Narrow, NI_VectorT128_Narrow, NI_VectorT128_Narrow, NI_VectorT128_Narrow, NI_VectorT128_Narrow, NI_VectorT128_Narrow, NI_VectorT128_Narrow, NI_VectorT128_Narrow, NI_VectorT128_Narrow, NI_VectorT128_Narrow}, SimdAsHWIntrinsicFlag::KeepBaseTypeFromRet) SIMD_AS_HWINTRINSIC_ID(VectorT128, op_Addition, 2, {NI_AdvSimd_Add, NI_AdvSimd_Add, NI_AdvSimd_Add, NI_AdvSimd_Add, NI_AdvSimd_Add, NI_AdvSimd_Add, NI_AdvSimd_Add, NI_AdvSimd_Add, NI_AdvSimd_Add, NI_AdvSimd_Arm64_Add}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, op_BitwiseAnd, 2, {NI_AdvSimd_And, NI_AdvSimd_And, NI_AdvSimd_And, NI_AdvSimd_And, NI_AdvSimd_And, NI_AdvSimd_And, NI_AdvSimd_And, NI_AdvSimd_And, NI_AdvSimd_And, NI_AdvSimd_And}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, op_BitwiseOr, 2, {NI_AdvSimd_Or, NI_AdvSimd_Or, NI_AdvSimd_Or, NI_AdvSimd_Or, NI_AdvSimd_Or, NI_AdvSimd_Or, NI_AdvSimd_Or, NI_AdvSimd_Or, NI_AdvSimd_Or, NI_AdvSimd_Or}, SimdAsHWIntrinsicFlag::None) diff --git a/src/coreclr/jit/simdashwintrinsiclistxarch.h b/src/coreclr/jit/simdashwintrinsiclistxarch.h index 92d665c2de8a7..355e7c3f5f54f 100644 --- a/src/coreclr/jit/simdashwintrinsiclistxarch.h +++ b/src/coreclr/jit/simdashwintrinsiclistxarch.h @@ -121,6 +121,7 @@ SIMD_AS_HWINTRINSIC_ID(VectorT128, LessThan, SIMD_AS_HWINTRINSIC_ID(VectorT128, LessThanOrEqual, 2, {NI_VectorT128_LessThanOrEqual, NI_VectorT128_LessThanOrEqual, NI_VectorT128_LessThanOrEqual, NI_VectorT128_LessThanOrEqual, NI_VectorT128_LessThanOrEqual, NI_VectorT128_LessThanOrEqual, NI_VectorT128_LessThanOrEqual, NI_VectorT128_LessThanOrEqual, NI_SSE_CompareLessThanOrEqual, NI_SSE2_CompareLessThanOrEqual}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, Max, 2, {NI_VectorT128_Max, NI_SSE2_Max, NI_SSE2_Max, NI_VectorT128_Max, NI_VectorT128_Max, NI_VectorT128_Max, NI_VectorT128_Max, NI_VectorT128_Max, NI_SSE_Max, NI_SSE2_Max}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, Min, 2, {NI_VectorT128_Min, NI_SSE2_Min, NI_SSE2_Min, NI_VectorT128_Min, NI_VectorT128_Min, NI_VectorT128_Min, NI_VectorT128_Min, NI_VectorT128_Min, NI_SSE_Min, NI_SSE2_Min}, SimdAsHWIntrinsicFlag::None) +SIMD_AS_HWINTRINSIC_ID(VectorT128, Narrow, 2, {NI_VectorT128_Narrow, NI_VectorT128_Narrow, NI_VectorT128_Narrow, 
NI_VectorT128_Narrow, NI_VectorT128_Narrow, NI_VectorT128_Narrow, NI_VectorT128_Narrow, NI_VectorT128_Narrow, NI_VectorT128_Narrow, NI_VectorT128_Narrow}, SimdAsHWIntrinsicFlag::KeepBaseTypeFromRet) SIMD_AS_HWINTRINSIC_ID(VectorT128, op_Addition, 2, {NI_SSE2_Add, NI_SSE2_Add, NI_SSE2_Add, NI_SSE2_Add, NI_SSE2_Add, NI_SSE2_Add, NI_SSE2_Add, NI_SSE2_Add, NI_SSE_Add, NI_SSE2_Add}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, op_BitwiseAnd, 2, {NI_SSE2_And, NI_SSE2_And, NI_SSE2_And, NI_SSE2_And, NI_SSE2_And, NI_SSE2_And, NI_SSE2_And, NI_SSE2_And, NI_SSE_And, NI_SSE2_And}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, op_BitwiseOr, 2, {NI_SSE2_Or, NI_SSE2_Or, NI_SSE2_Or, NI_SSE2_Or, NI_SSE2_Or, NI_SSE2_Or, NI_SSE2_Or, NI_SSE2_Or, NI_SSE_Or, NI_SSE2_Or}, SimdAsHWIntrinsicFlag::None) @@ -160,6 +161,7 @@ SIMD_AS_HWINTRINSIC_ID(VectorT256, LessThan, SIMD_AS_HWINTRINSIC_ID(VectorT256, LessThanOrEqual, 2, {NI_VectorT256_LessThanOrEqual, NI_VectorT256_LessThanOrEqual, NI_VectorT256_LessThanOrEqual, NI_VectorT256_LessThanOrEqual, NI_VectorT256_LessThanOrEqual, NI_VectorT256_LessThanOrEqual, NI_VectorT256_LessThanOrEqual, NI_VectorT256_LessThanOrEqual, NI_AVX_CompareLessThanOrEqual, NI_AVX_CompareLessThanOrEqual}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT256, Max, 2, {NI_AVX2_Max, NI_AVX2_Max, NI_AVX2_Max, NI_AVX2_Max, NI_AVX2_Max, NI_AVX2_Max, NI_VectorT256_Max, NI_VectorT256_Max, NI_AVX_Max, NI_AVX_Max}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT256, Min, 2, {NI_AVX2_Min, NI_AVX2_Min, NI_AVX2_Min, NI_AVX2_Min, NI_AVX2_Min, NI_AVX2_Min, NI_VectorT256_Min, NI_VectorT256_Min, NI_AVX_Min, NI_AVX_Min}, SimdAsHWIntrinsicFlag::None) +SIMD_AS_HWINTRINSIC_ID(VectorT256, Narrow, 2, {NI_VectorT256_Narrow, NI_VectorT256_Narrow, NI_VectorT256_Narrow, NI_VectorT256_Narrow, NI_VectorT256_Narrow, NI_VectorT256_Narrow, NI_VectorT256_Narrow, NI_VectorT256_Narrow, NI_VectorT256_Narrow, NI_VectorT256_Narrow}, SimdAsHWIntrinsicFlag::KeepBaseTypeFromRet) SIMD_AS_HWINTRINSIC_ID(VectorT256, op_Addition, 2, {NI_AVX2_Add, NI_AVX2_Add, NI_AVX2_Add, NI_AVX2_Add, NI_AVX2_Add, NI_AVX2_Add, NI_AVX2_Add, NI_AVX2_Add, NI_AVX_Add, NI_AVX_Add}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT256, op_BitwiseAnd, 2, {NI_AVX2_And, NI_AVX2_And, NI_AVX2_And, NI_AVX2_And, NI_AVX2_And, NI_AVX2_And, NI_AVX2_And, NI_AVX2_And, NI_AVX_And, NI_AVX_And}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT256, op_BitwiseOr, 2, {NI_AVX2_Or, NI_AVX2_Or, NI_AVX2_Or, NI_AVX2_Or, NI_AVX2_Or, NI_AVX2_Or, NI_AVX2_Or, NI_AVX2_Or, NI_AVX_Or, NI_AVX_Or}, SimdAsHWIntrinsicFlag::None) diff --git a/src/coreclr/jit/simdcodegenxarch.cpp b/src/coreclr/jit/simdcodegenxarch.cpp index 4523fe48a896e..ea391b992e064 100644 --- a/src/coreclr/jit/simdcodegenxarch.cpp +++ b/src/coreclr/jit/simdcodegenxarch.cpp @@ -241,33 +241,6 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type result = INS_cvttsd2si; break; - case SIMDIntrinsicNarrow: - // Note that for the integer types the caller must zero the upper bits of - // each source element, since the instructions saturate. 
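(To make the saturate-versus-truncate distinction concrete -- an illustrative
example, not from the original sources:

    ushort x = 0x0101;             // 257
    byte viaSaturatingPack = 0xFF; // packuswb clamps, since 257 > 255
    byte viaNarrow = (byte)x;      // 0x01, the truncating CLR semantics Narrow requires
)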
- switch (baseType) - { - case TYP_INT: - case TYP_UINT: - if (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported) - { - result = INS_packusdw; - } - else - { - result = INS_packssdw; - } - break; - case TYP_SHORT: - case TYP_USHORT: - result = INS_packuswb; - break; - default: - assert(!"Invalid baseType for SIMDIntrinsicNarrow"); - result = INS_invalid; - break; - } - break; - case SIMDIntrinsicWidenLo: // Some of these have multiple instruction implementations, with one instruction to widen the lo half, // and another to widen the hi half. @@ -1285,153 +1258,6 @@ void CodeGen::genSIMDIntrinsicWiden(GenTreeSIMD* simdNode) genProduceReg(simdNode); } -//-------------------------------------------------------------------------------- -// genSIMDIntrinsicNarrow: Generate code for SIMD Intrinsic Narrow operations -// -// Arguments: -// simdNode - The GT_SIMD node -// -// Notes: -// This intrinsic takes two arguments. The first operand is narrowed to produce the -// lower elements of the results, and the second operand produces the high elements. -// -void CodeGen::genSIMDIntrinsicNarrow(GenTreeSIMD* simdNode) -{ - assert(simdNode->gtSIMDIntrinsicID == SIMDIntrinsicNarrow); - - GenTree* op1 = simdNode->gtGetOp1(); - GenTree* op2 = simdNode->gtGetOp2(); - var_types baseType = simdNode->GetSimdBaseType(); - regNumber targetReg = simdNode->GetRegNum(); - assert(targetReg != REG_NA); - var_types simdType = simdNode->TypeGet(); - emitAttr emitSize = emitTypeSize(simdType); - SIMDLevel level = compiler->getSIMDSupportLevel(); - - genConsumeOperands(simdNode); - regNumber op1Reg = op1->GetRegNum(); - regNumber op2Reg = op2->GetRegNum(); - if (baseType == TYP_DOUBLE) - { - regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT); - - inst_RV_RV(INS_cvtpd2ps, targetReg, op1Reg, simdType); - inst_RV_RV(INS_cvtpd2ps, tmpReg, op2Reg, simdType); - // Now insert the high-order result (in tmpReg) into the upper half of targetReg. - if (level == SIMD_AVX2_Supported) - { - GetEmitter()->emitIns_R_R_I(INS_vinsertf128, EA_32BYTE, targetReg, tmpReg, 0x01); - } - else - { - inst_RV_RV_IV(INS_shufps, EA_16BYTE, targetReg, tmpReg, (int8_t)SHUFFLE_YXYX); - } - } - else if (varTypeIsLong(baseType)) - { - if (level == SIMD_AVX2_Supported) - { - // We have 8 long elements, 0-3 in op1Reg, 4-7 in op2Reg. 
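(Aside on the SHUFFLE_* immediates used in sequences like the one below: the
four letters name the source element, X=0 through W=3, selected for destination
elements 3 down to 0, two bits each; so SHUFFLE_XXZX encodes 0b00'00'10'00 =
0x08, and SHUFFLE_WYZX, used by the new AVX2 expansion above, encodes
0b11'01'10'00 = 0xD8.)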
- // We will generate the following: - // vextracti128 tmpReg, op1Reg, 1 (extract elements 2 and 3 into tmpReg) - // vextracti128 tmpReg2, op2Reg, 1 (extract elements 6 and 7 into tmpReg2) - // vinserti128 tmpReg, tmpReg2, 1 (insert elements 6 and 7 into the high half of tmpReg) - // mov tmpReg2, op1Reg - // vinserti128 tmpReg2, op2Reg, 1 (insert elements 4 and 5 into the high half of tmpReg2) - // pshufd tmpReg, tmpReg, XXZX ( - - 7L 6L - - 3L 2L) in tmpReg - // pshufd tgtReg, tmpReg2, XXZX ( - - 5L 4L - - 1L 0L) in tgtReg - // punpcklqdq tgtReg, tmpReg - regNumber tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT); - regNumber tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT); - GetEmitter()->emitIns_R_R_I(INS_vextracti128, EA_32BYTE, tmpReg, op1Reg, 0x01); - GetEmitter()->emitIns_R_R_I(INS_vextracti128, EA_32BYTE, tmpReg2, op2Reg, 0x01); - GetEmitter()->emitIns_R_R_I(INS_vinserti128, EA_32BYTE, tmpReg, tmpReg2, 0x01); - inst_Mov(simdType, tmpReg2, op1Reg, /* canSkip */ false, emitSize); - GetEmitter()->emitIns_R_R_I(INS_vinserti128, EA_32BYTE, tmpReg2, op2Reg, 0x01); - GetEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, tmpReg, tmpReg, (int8_t)SHUFFLE_XXZX); - GetEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, targetReg, tmpReg2, (int8_t)SHUFFLE_XXZX); - inst_RV_RV_RV(INS_punpcklqdq, targetReg, targetReg, tmpReg, emitSize); - } - else - { - // We will generate the following: - // pshufd targetReg, op1Reg, ZXXX (extract the low 32-bits into the upper two 32-bit elements) - // psrldq targetReg, 8 (shift them right to get zeros in the high elements) - // pshufd tmpReg, op2Reg, XXZX (same as above, but extract into the lower two 32-bit elements) - // pslldq tmpReg, 8 (now shift these left to get zeros in the low elements) - // por targetReg, tmpReg - regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT); - instruction shiftLeftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, TYP_SIMD16); - instruction shiftRightIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, TYP_SIMD16); - emitAttr emitSize = emitTypeSize(simdType); - - GetEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, targetReg, op1Reg, (int8_t)SHUFFLE_ZXXX); - GetEmitter()->emitIns_R_I(shiftRightIns, emitSize, targetReg, 8); - GetEmitter()->emitIns_R_R_I(INS_pshufd, emitSize, tmpReg, op2Reg, (int8_t)SHUFFLE_XXZX); - GetEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg, 8); - inst_RV_RV(INS_por, targetReg, tmpReg, simdType); - } - } - else - { - // We will generate the following: - // mov targetReg, op1Reg - // mov tmpReg, op2Reg - // psll? targetReg, shiftCount - // pslr? targetReg, shiftCount - // psll? tmpReg, shiftCount - // pslr? tmpReg, shiftCount - // targetReg, tmpReg - // Where shiftCount is the size of the target baseType (i.e. half the size of the source baseType), - // and is the appropriate instruction to pack the result (note that we have to truncate to - // get CLR type semantics; otherwise it will saturate). 
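(Worked example of the shift arithmetic above: for TYP_INT sources,
shiftCount = genTypeSize(TYP_INT) * (BITS_IN_BYTE / 2) = 4 * 4 = 16, so each
32-bit element is shifted left and then back right by 16 bits, leaving only
its low 16 bits, zero- or sign-extended as the pack instruction requires, so
that packing truncates instead of saturating.)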
- // - int shiftCount = genTypeSize(baseType) * (BITS_IN_BYTE / 2); - instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType); - instruction shiftLeftIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType); - instruction shiftRightIns = getOpForSIMDIntrinsic(SIMDIntrinsicShiftRightInternal, baseType); - - assert((shiftCount >= 0) && (shiftCount <= 127)); - - if (level == SIMD_AVX2_Supported) - { - regNumber tmpReg = simdNode->ExtractTempReg(RBM_ALLFLOAT); - regNumber tmpReg2 = simdNode->GetSingleTempReg(RBM_ALLFLOAT); - - // The AVX instructions generally operate on "lanes", so we have to permute the - // inputs so that the destination register has the low 128-bit halves of the two - // inputs, and 'tmpReg' has the high 128-bit halves of the two inputs. - GetEmitter()->emitIns_R_R_R_I(INS_vperm2i128, emitSize, tmpReg2, op1Reg, op2Reg, 0x20); - GetEmitter()->emitIns_R_R_R_I(INS_vperm2i128, emitSize, tmpReg, op1Reg, op2Reg, 0x31); - GetEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg2, shiftCount); - GetEmitter()->emitIns_R_I(shiftRightIns, emitSize, tmpReg2, shiftCount); - GetEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg, shiftCount); - GetEmitter()->emitIns_R_I(shiftRightIns, emitSize, tmpReg, shiftCount); - inst_RV_RV_RV(ins, targetReg, tmpReg2, tmpReg, emitActualTypeSize(simdType)); - } - else - { - regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT); - - inst_Mov(simdType, targetReg, op1Reg, /* canSkip */ false, emitSize); - inst_Mov(simdType, tmpReg, op2Reg, /* canSkip */ false, emitSize); - - instruction tmpShiftRight = shiftRightIns; - if ((baseType == TYP_INT || baseType == TYP_UINT) && level == SIMD_SSE2_Supported) - { - tmpShiftRight = INS_psrad; - } - - GetEmitter()->emitIns_R_I(shiftLeftIns, emitSize, targetReg, shiftCount); - GetEmitter()->emitIns_R_I(tmpShiftRight, emitSize, targetReg, shiftCount); - GetEmitter()->emitIns_R_I(shiftLeftIns, emitSize, tmpReg, shiftCount); - GetEmitter()->emitIns_R_I(tmpShiftRight, emitSize, tmpReg, shiftCount); - inst_RV_RV(ins, targetReg, tmpReg, simdType); - } - } - genProduceReg(simdNode); -} - //-------------------------------------------------------------------------------- // genSIMDIntrinsicBinOp: Generate code for SIMD Intrinsic binary operations // add, sub, mul, bit-wise And, AndNot and Or. @@ -1960,10 +1786,6 @@ void CodeGen::genSIMDIntrinsic(GenTreeSIMD* simdNode) genSIMDIntrinsicWiden(simdNode); break; - case SIMDIntrinsicNarrow: - genSIMDIntrinsicNarrow(simdNode); - break; - case SIMDIntrinsicSub: case SIMDIntrinsicBitwiseAnd: case SIMDIntrinsicBitwiseOr: diff --git a/src/coreclr/jit/simdintrinsiclist.h b/src/coreclr/jit/simdintrinsiclist.h index 258fecfdd6578..1624382dfb528 100644 --- a/src/coreclr/jit/simdintrinsiclist.h +++ b/src/coreclr/jit/simdintrinsiclist.h @@ -78,8 +78,6 @@ SIMD_INTRINSIC("ConvertToDouble", false, ConvertToDouble, SIMD_INTRINSIC("ConvertToInt32", false, ConvertToInt32, "ConvertToInt32", TYP_STRUCT, 1, {TYP_STRUCT, TYP_UNDEF, TYP_UNDEF}, {TYP_FLOAT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) // Convert double to long SIMD_INTRINSIC("ConvertToInt64", false, ConvertToInt64, "ConvertToInt64", TYP_STRUCT, 1, {TYP_STRUCT, TYP_UNDEF, TYP_UNDEF}, {TYP_DOUBLE, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) -// Narrow two input Vectors to a single Vector. 
The return value's lower elements are the elements from src1, and the upper elements are from src2. -SIMD_INTRINSIC("Narrow", false, Narrow, "Narrow", TYP_STRUCT, 2, {TYP_STRUCT, TYP_STRUCT, TYP_UNDEF}, {TYP_INT, TYP_DOUBLE, TYP_LONG, TYP_USHORT, TYP_SHORT, TYP_UINT, TYP_ULONG, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) // Widen one input Vector to two Vectors: dest1 contains the lower half of elements in src, and dest2 contains the upper half of elements in src. SIMD_INTRINSIC("Widen", false, Widen, "Widen", TYP_VOID, 3, {TYP_STRUCT, TYP_BYREF, TYP_BYREF}, {TYP_INT, TYP_FLOAT, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) From df0a3dd44930e270857b8ee3073016297260184e Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Wed, 6 Oct 2021 14:30:44 -0700 Subject: [PATCH 2/5] Moving Widen to implemented using SIMDAsHWIntrinsic --- src/coreclr/jit/codegen.h | 1 - src/coreclr/jit/codegenarm64.cpp | 52 --- src/coreclr/jit/compiler.h | 12 + src/coreclr/jit/gentree.cpp | 290 ++++++++++++++- src/coreclr/jit/hwintrinsicarm64.cpp | 22 ++ src/coreclr/jit/hwintrinsiclistarm64.h | 4 + src/coreclr/jit/hwintrinsiclistxarch.h | 4 + src/coreclr/jit/hwintrinsicxarch.cpp | 30 ++ src/coreclr/jit/lsraarm64.cpp | 3 - src/coreclr/jit/lsraxarch.cpp | 10 - src/coreclr/jit/simd.cpp | 35 -- src/coreclr/jit/simdashwintrinsic.cpp | 29 ++ src/coreclr/jit/simdashwintrinsiclistarm64.h | 2 + src/coreclr/jit/simdashwintrinsiclistxarch.h | 4 + src/coreclr/jit/simdcodegenxarch.cpp | 150 -------- src/coreclr/jit/simdintrinsiclist.h | 6 - src/coreclr/jit/valuenum.cpp | 4 - .../System/Runtime/Intrinsics/Vector128.cs | 350 +++++++++++------- .../System/Runtime/Intrinsics/Vector256.cs | 350 +++++++++++------- .../src/System/Runtime/Intrinsics/Vector64.cs | 308 +++++++++------ 20 files changed, 1025 insertions(+), 641 deletions(-) diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index 3c4c1da158196..505aae2f6f5f3 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -968,7 +968,6 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX void genSIMDIntrinsic32BitConvert(GenTreeSIMD* simdNode); void genSIMDIntrinsic64BitConvert(GenTreeSIMD* simdNode); void genSIMDExtractUpperHalf(GenTreeSIMD* simdNode, regNumber srcReg, regNumber tgtReg); - void genSIMDIntrinsicWiden(GenTreeSIMD* simdNode); void genSIMDIntrinsic(GenTreeSIMD* simdNode); // TYP_SIMD12 (i.e Vector3 of size 12 bytes) is not a hardware supported size and requires diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index f4fb89fe35000..fa14fbc4162da 100644 --- a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -3894,11 +3894,6 @@ void CodeGen::genSIMDIntrinsic(GenTreeSIMD* simdNode) genSIMDIntrinsicUnOp(simdNode); break; - case SIMDIntrinsicWidenLo: - case SIMDIntrinsicWidenHi: - genSIMDIntrinsicWiden(simdNode); - break; - case SIMDIntrinsicSub: case SIMDIntrinsicBitwiseAnd: case SIMDIntrinsicBitwiseOr: @@ -3990,12 +3985,6 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type case SIMDIntrinsicSub: result = INS_fsub; break; - case SIMDIntrinsicWidenLo: - result = INS_fcvtl; - break; - case SIMDIntrinsicWidenHi: - result = INS_fcvtl2; - break; default: assert(!"Unsupported SIMD intrinsic"); unreached(); @@ -4026,12 +4015,6 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type case SIMDIntrinsicSub: result = INS_sub; break; - case SIMDIntrinsicWidenLo: - result = 
isUnsigned ? INS_uxtl : INS_sxtl; - break; - case SIMDIntrinsicWidenHi: - result = isUnsigned ? INS_uxtl2 : INS_sxtl2; - break; default: assert(!"Unsupported SIMD intrinsic"); unreached(); @@ -4210,41 +4193,6 @@ void CodeGen::genSIMDIntrinsicUnOp(GenTreeSIMD* simdNode) genProduceReg(simdNode); } -//-------------------------------------------------------------------------------- -// genSIMDIntrinsicWiden: Generate code for SIMD Intrinsic Widen operations -// -// Arguments: -// simdNode - The GT_SIMD node -// -// Notes: -// The Widen intrinsics are broken into separate intrinsics for the two results. -// -void CodeGen::genSIMDIntrinsicWiden(GenTreeSIMD* simdNode) -{ - assert((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenLo) || - (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi)); - - GenTree* op1 = simdNode->gtGetOp1(); - var_types baseType = simdNode->GetSimdBaseType(); - regNumber targetReg = simdNode->GetRegNum(); - assert(targetReg != REG_NA); - var_types simdType = simdNode->TypeGet(); - - genConsumeOperands(simdNode); - regNumber op1Reg = op1->GetRegNum(); - regNumber srcReg = op1Reg; - emitAttr emitSize = emitActualTypeSize(simdType); - - instruction ins = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType); - - emitAttr attr = (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi) ? EA_16BYTE : EA_8BYTE; - insOpts opt = genGetSimdInsOpt(attr, baseType); - - GetEmitter()->emitIns_R_R(ins, attr, targetReg, op1Reg, opt); - - genProduceReg(simdNode); -} - //-------------------------------------------------------------------------------- // genSIMDIntrinsicBinOp: Generate code for SIMD Intrinsic binary operations // add, sub, mul, bit-wise And, AndNot and Or. diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 63511d1c723d5..623119d2ea08a 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -3279,6 +3279,18 @@ class Compiler unsigned simdSize, bool isSimdAsHWIntrinsic); + GenTree* gtNewSimdWidenLowerNode(var_types type, + GenTree* op1, + CorInfoType simdBaseJitType, + unsigned simdSize, + bool isSimdAsHWIntrinsic); + + GenTree* gtNewSimdWidenUpperNode(var_types type, + GenTree* op1, + CorInfoType simdBaseJitType, + unsigned simdSize, + bool isSimdAsHWIntrinsic); + GenTree* gtNewSimdWithElementNode(var_types type, GenTree* op1, GenTree* op2, diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 49e9c22dd9788..3bc9490498e5d 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -20932,8 +20932,6 @@ GenTree* Compiler::gtNewSimdNarrowNode(var_types type, var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType); assert(varTypeIsArithmetic(simdBaseType)); - NamedIntrinsic intrinsic = NI_Illegal; - GenTree* tmp1; GenTree* tmp2; @@ -21457,6 +21455,294 @@ GenTree* Compiler::gtNewSimdUnOpNode(genTreeOps op, } } +GenTree* Compiler::gtNewSimdWidenLowerNode(var_types type, + GenTree* op1, + CorInfoType simdBaseJitType, + unsigned simdSize, + bool isSimdAsHWIntrinsic) +{ + assert(IsBaselineSimdIsaSupportedDebugOnly()); + + assert(varTypeIsSIMD(type)); + assert(getSIMDTypeForSize(simdSize) == type); + + assert(op1 != nullptr); + assert(op1->TypeIs(type)); + + var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType); + assert(varTypeIsArithmetic(simdBaseType)); + + NamedIntrinsic intrinsic = NI_Illegal; + + GenTree* tmp1; + +#if defined(TARGET_XARCH) + if (simdSize == 32) + { + assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); + assert(!varTypeIsIntegral(simdBaseType) || 
compIsaSupportedDebugOnly(InstructionSet_AVX2)); + + tmp1 = gtNewSimdHWIntrinsicNode(type, op1, NI_Vector256_GetLower, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + + switch (simdBaseType) + { + case TYP_BYTE: + case TYP_UBYTE: + { + intrinsic = NI_AVX2_ConvertToVector256Int16; + break; + } + + case TYP_SHORT: + case TYP_USHORT: + { + intrinsic = NI_AVX2_ConvertToVector256Int32; + break; + } + + case TYP_INT: + case TYP_UINT: + { + intrinsic = NI_AVX2_ConvertToVector256Int64; + break; + } + + case TYP_FLOAT: + { + intrinsic = NI_AVX_ConvertToVector256Double; + break; + } + + default: + { + unreached(); + } + } + + assert(intrinsic != NI_Illegal); + return gtNewSimdHWIntrinsicNode(type, tmp1, intrinsic, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + } + else if ((simdBaseType == TYP_FLOAT) || compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + switch (simdBaseType) + { + case TYP_BYTE: + case TYP_UBYTE: + { + intrinsic = NI_SSE41_ConvertToVector128Int16; + break; + } + + case TYP_SHORT: + case TYP_USHORT: + { + intrinsic = NI_SSE41_ConvertToVector128Int32; + break; + } + + case TYP_INT: + case TYP_UINT: + { + intrinsic = NI_SSE41_ConvertToVector128Int64; + break; + } + + case TYP_FLOAT: + { + intrinsic = NI_SSE2_ConvertToVector128Double; + break; + } + + default: + { + unreached(); + } + } + + assert(intrinsic != NI_Illegal); + return gtNewSimdHWIntrinsicNode(type, op1, intrinsic, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + } + else + { + tmp1 = gtNewSimdZeroNode(type, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + + if (varTypeIsSigned(simdBaseType)) + { + tmp1 = gtNewSimdHWIntrinsicNode(type, op1, tmp1, NI_SSE2_CompareLessThan, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + } + + return gtNewSimdHWIntrinsicNode(type, op1, tmp1, NI_SSE2_UnpackLow, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + } +#elif defined(TARGET_ARM64) + tmp1 = gtNewSimdHWIntrinsicNode(type, op1, NI_Vector128_GetLower, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + + if (varTypeIsFloating(simdBaseType)) + { + intrinsic = NI_AdvSimd_Arm64_ConvertToDouble; + } + else if (varTypeIsSigned(simdBaseType)) + { + intrinsic = NI_AdvSimd_SignExtendWideningLower; + } + else + { + intrinsic = NI_AdvSimd_ZeroExtendWideningLower; + } + + assert(intrinsic != NI_Illegal); + return gtNewSimdHWIntrinsicNode(type, tmp1, intrinsic, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); +#else +#error Unsupported platform +#endif // !TARGET_XARCH && !TARGET_ARM64 +} + +GenTree* Compiler::gtNewSimdWidenUpperNode(var_types type, + GenTree* op1, + CorInfoType simdBaseJitType, + unsigned simdSize, + bool isSimdAsHWIntrinsic) +{ + assert(IsBaselineSimdIsaSupportedDebugOnly()); + + assert(varTypeIsSIMD(type)); + assert(getSIMDTypeForSize(simdSize) == type); + + assert(op1 != nullptr); + assert(op1->TypeIs(type)); + + var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType); + assert(varTypeIsArithmetic(simdBaseType)); + + NamedIntrinsic intrinsic = NI_Illegal; + +#if defined(TARGET_XARCH) + GenTree* tmp1; + + if (simdSize == 32) + { + assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); + assert(!varTypeIsIntegral(simdBaseType) || compIsaSupportedDebugOnly(InstructionSet_AVX2)); + + tmp1 = gtNewSimdHWIntrinsicNode(type, op1, gtNewIconNode(1), NI_AVX_ExtractVector128, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + + switch (simdBaseType) + { + case TYP_BYTE: + case TYP_UBYTE: + { + intrinsic = NI_AVX2_ConvertToVector256Int16; + break; + } + + case TYP_SHORT: + case TYP_USHORT: + { + intrinsic = 
NI_AVX2_ConvertToVector256Int32; + break; + } + + case TYP_INT: + case TYP_UINT: + { + intrinsic = NI_AVX2_ConvertToVector256Int64; + break; + } + + case TYP_FLOAT: + { + intrinsic = NI_AVX_ConvertToVector256Double; + break; + } + + default: + { + unreached(); + } + } + + assert(intrinsic != NI_Illegal); + return gtNewSimdHWIntrinsicNode(type, tmp1, intrinsic, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + } + else if (varTypeIsFloating(simdBaseType)) + { + assert(simdBaseType == TYP_FLOAT); + CORINFO_CLASS_HANDLE clsHnd = gtGetStructHandleForSIMD(type, simdBaseJitType); + + GenTree* op1Dup; + op1 = impCloneExpr(op1, &op1Dup, clsHnd, (unsigned)CHECK_SPILL_ALL, + nullptr DEBUGARG("Clone op1 for vector widen upper")); + + tmp1 = gtNewSimdHWIntrinsicNode(type, op1, op1Dup, NI_SSE_MoveHighToLow, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + return gtNewSimdHWIntrinsicNode(type, tmp1, NI_SSE2_ConvertToVector128Double, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + } + else if (compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + tmp1 = gtNewSimdHWIntrinsicNode(type, op1, gtNewIconNode(8), NI_SSE2_ShiftRightLogical128BitLane, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + + switch (simdBaseType) + { + case TYP_BYTE: + case TYP_UBYTE: + { + intrinsic = NI_SSE41_ConvertToVector128Int16; + break; + } + + case TYP_SHORT: + case TYP_USHORT: + { + intrinsic = NI_SSE41_ConvertToVector128Int32; + break; + } + + case TYP_INT: + case TYP_UINT: + { + intrinsic = NI_SSE41_ConvertToVector128Int64; + break; + } + + default: + { + unreached(); + } + } + + assert(intrinsic != NI_Illegal); + return gtNewSimdHWIntrinsicNode(type, tmp1, intrinsic, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + } + else + { + tmp1 = gtNewSimdZeroNode(type, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + + if (varTypeIsSigned(simdBaseType)) + { + tmp1 = gtNewSimdHWIntrinsicNode(type, op1, tmp1, NI_SSE2_CompareLessThan, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + } + + return gtNewSimdHWIntrinsicNode(type, op1, tmp1, NI_SSE2_UnpackHigh, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + } +#elif defined(TARGET_ARM64) + if (varTypeIsFloating(simdBaseType)) + { + intrinsic = NI_AdvSimd_Arm64_ConvertToDoubleUpper; + } + else if (varTypeIsSigned(simdBaseType)) + { + intrinsic = NI_AdvSimd_SignExtendWideningUpper; + } + else + { + intrinsic = NI_AdvSimd_ZeroExtendWideningUpper; + } + + assert(intrinsic != NI_Illegal); + return gtNewSimdHWIntrinsicNode(type, op1, intrinsic, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); +#else +#error Unsupported platform +#endif // !TARGET_XARCH && !TARGET_ARM64 +} + GenTree* Compiler::gtNewSimdWithElementNode(var_types type, GenTree* op1, GenTree* op2, diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp index e49fc2c162be2..3fcf4a6528192 100644 --- a/src/coreclr/jit/hwintrinsicarm64.cpp +++ b/src/coreclr/jit/hwintrinsicarm64.cpp @@ -903,6 +903,28 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector64_WidenLower: + case NI_Vector128_WidenLower: + { + assert(sig->numArgs == 1); + + op1 = impSIMDPopStack(retType); + + retNode = gtNewSimdWidenLowerNode(retType, op1, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); + break; + } + + case NI_Vector64_WidenUpper: + case NI_Vector128_WidenUpper: + { + assert(sig->numArgs == 2); + + op1 = impSIMDPopStack(retType); + + retNode = gtNewSimdWidenUpperNode(retType, op1, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); + break; + 
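+ // For reference, a sketch of the managed behavior being imported here
+ // (illustrative only; not code from this patch):
+ //
+ //   Vector128<short> v = ...;                    // 8 narrow elements
+ //   Vector128<int> lo = Vector128.WidenLower(v); // widens elements 0..3
+ //   Vector128<int> hi = Vector128.WidenUpper(v); // widens elements 4..7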
} + case NI_Vector64_WithElement: case NI_Vector128_WithElement: { diff --git a/src/coreclr/jit/hwintrinsiclistarm64.h b/src/coreclr/jit/hwintrinsiclistarm64.h index ba185f3afeb91..97bdd3bb0004a 100644 --- a/src/coreclr/jit/hwintrinsiclistarm64.h +++ b/src/coreclr/jit/hwintrinsiclistarm64.h @@ -87,6 +87,8 @@ HARDWARE_INTRINSIC(Vector64, Sqrt, HARDWARE_INTRINSIC(Vector64, ToScalar, 8, 1, {INS_smov, INS_umov, INS_smov, INS_umov, INS_smov, INS_umov, INS_umov, INS_umov, INS_dup, INS_dup}, HW_Category_SIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SIMDScalar|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Vector64, ToVector128, 8, 1, {INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov}, HW_Category_SIMD, HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Vector64, ToVector128Unsafe, 8, 1, {INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov}, HW_Category_SIMD, HW_Flag_SpecialCodeGen) +HARDWARE_INTRINSIC(Vector64, WidenLower, 8, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector64, WidenUpper, 8, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector64, WithElement, 8, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(Vector64, Xor, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) @@ -171,6 +173,8 @@ HARDWARE_INTRINSIC(Vector128, op_UnaryPlus, HARDWARE_INTRINSIC(Vector128, Subtract, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, Sqrt, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, ToScalar, 16, 1, {INS_smov, INS_umov, INS_smov, INS_umov, INS_smov, INS_umov, INS_umov, INS_umov, INS_dup, INS_dup}, HW_Category_SIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SIMDScalar|HW_Flag_SpecialCodeGen) +HARDWARE_INTRINSIC(Vector128, WidenLower, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, WidenUpper, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, WithElement, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(Vector128, Xor, 16, 2, {INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 00a402260fc42..7179e9854b382 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -104,6 +104,8 @@ HARDWARE_INTRINSIC(Vector128, Sqrt, HARDWARE_INTRINSIC(Vector128, ToScalar, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsdsse2}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector128, ToVector256, 16, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector128, ToVector256Unsafe, 16, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(Vector128, WidenLower, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector128, WidenUpper, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector128, WithElement, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector128, Xor, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) @@ -185,6 +187,8 @@ HARDWARE_INTRINSIC(Vector256, op_UnaryPlus, HARDWARE_INTRINSIC(Vector256, Subtract, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, Sqrt, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, ToScalar, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsdsse2}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(Vector256, WidenLower, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector256, 
WidenUpper, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector256, WithElement, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector256, Xor, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 0ee97e80f0b9f..c0ed9a4de197f 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -1424,6 +1424,36 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector128_WidenLower: + case NI_Vector256_WidenLower: + { + assert(sig->numArgs == 1); + + if ((simdSize != 32) || varTypeIsFloating(simdBaseType) || compExactlyDependsOn(InstructionSet_AVX2)) + { + op1 = impSIMDPopStack(retType); + + retNode = + gtNewSimdWidenLowerNode(retType, op1, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); + } + break; + } + + case NI_Vector128_WidenUpper: + case NI_Vector256_WidenUpper: + { + assert(sig->numArgs == 1); + + if ((simdSize != 32) || varTypeIsFloating(simdBaseType) || compExactlyDependsOn(InstructionSet_AVX2)) + { + op1 = impSIMDPopStack(retType); + + retNode = + gtNewSimdWidenUpperNode(retType, op1, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); + } + break; + } + case NI_Vector128_WithElement: case NI_Vector256_WithElement: { diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index 8a1173ad600d8..fbcddacf4269a 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -804,8 +804,6 @@ int LinearScan::BuildSIMD(GenTreeSIMD* simdTree) case SIMDIntrinsicConvertToInt32: case SIMDIntrinsicConvertToDouble: case SIMDIntrinsicConvertToInt64: - case SIMDIntrinsicWidenLo: - case SIMDIntrinsicWidenHi: // No special handling required. break; @@ -852,7 +850,6 @@ int LinearScan::BuildSIMD(GenTreeSIMD* simdTree) case SIMDIntrinsicCopyToArrayX: case SIMDIntrinsicNone: case SIMDIntrinsicHWAccel: - case SIMDIntrinsicWiden: case SIMDIntrinsicInvalid: assert(!"These intrinsics should not be seen during register allocation"); FALLTHROUGH; diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index d3eec424a91ae..9b23c3a57d223 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -1975,16 +1975,6 @@ int LinearScan::BuildSIMD(GenTreeSIMD* simdTree) case SIMDIntrinsicConvertToInt32: break; - case SIMDIntrinsicWidenLo: - case SIMDIntrinsicWidenHi: - if (varTypeIsIntegral(simdTree->GetSimdBaseType())) - { - // We need an internal register different from targetReg. - setInternalRegsDelayFree = true; - buildInternalFloatRegisterDefForNode(simdTree); - } - break; - case SIMDIntrinsicConvertToInt64: // We need an internal register different from targetReg. 
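For context on the SSE2-only path added to gtNewSimdWidenUpperNode in the gentree.cpp hunk above (zero vector, CompareLessThan, UnpackHigh): the same trick can be expressed against the public Sse2 surface. A minimal sketch, assuming SSE2 and the little-endian lane layout; WidenUpperSse2 is a hypothetical helper name for illustration, not part of the patch:

    using System.Runtime.Intrinsics;
    using System.Runtime.Intrinsics.X86;

    static Vector128<short> WidenUpperSse2(Vector128<sbyte> value)
    {
        // Per-lane sign mask: 0x00 for non-negative lanes, 0xFF for negative lanes.
        Vector128<sbyte> sign = Sse2.CompareLessThan(value, Vector128<sbyte>.Zero);

        // punpckhbw interleaves the upper eight value bytes with their sign bytes,
        // which is exactly sign extension of those lanes to 16 bits.
        return Sse2.UnpackHigh(value, sign).AsInt16();
    }

For unsigned element types the sign mask stays zero, so the unpack alone performs the zero extension; on ARM64 the whole operation instead maps to a single instruction (NI_AdvSimd_SignExtendWideningUpper or NI_AdvSimd_ZeroExtendWideningUpper).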
setInternalRegsDelayFree = true; diff --git a/src/coreclr/jit/simd.cpp b/src/coreclr/jit/simd.cpp index 9ba93aa3dad25..d60fb49a5d9b0 100644 --- a/src/coreclr/jit/simd.cpp +++ b/src/coreclr/jit/simd.cpp @@ -1204,8 +1204,6 @@ const SIMDIntrinsicInfo* Compiler::getSIMDIntrinsicInfo(CORINFO_CLASS_HANDLE* in case SIMDIntrinsicConvertToDouble: case SIMDIntrinsicConvertToInt32: case SIMDIntrinsicConvertToInt64: - case SIMDIntrinsicWidenHi: - case SIMDIntrinsicWidenLo: return true; default: @@ -2358,39 +2356,6 @@ GenTree* Compiler::impSIMDIntrinsic(OPCODE opcode, } break; - case SIMDIntrinsicWiden: - { - GenTree* dstAddrHi = impSIMDPopStack(TYP_BYREF); - GenTree* dstAddrLo = impSIMDPopStack(TYP_BYREF); - op1 = impSIMDPopStack(simdType); - // op1 must have a valid class handle; the following method will assert it. - CORINFO_CLASS_HANDLE op1Handle = gtGetStructHandle(op1); - GenTree* dupOp1 = fgInsertCommaFormTemp(&op1, op1Handle); - - // Widen the lower half and assign it to dstAddrLo. - simdTree = gtNewSIMDNode(simdType, op1, nullptr, SIMDIntrinsicWidenLo, simdBaseJitType, size); - // TODO-1stClassStructs: With the introduction of ClassLayout it would be preferrable to use - // GT_OBJ instead of GT_BLK nodes to avoid losing information about the actual vector type. - GenTree* loDest = new (this, GT_BLK) - GenTreeBlk(GT_BLK, simdType, dstAddrLo, typGetBlkLayout(getSIMDTypeSizeInBytes(clsHnd))); - GenTree* loAsg = gtNewBlkOpNode(loDest, simdTree, - false, // not volatile - true); // copyBlock - loAsg->gtFlags |= ((simdTree->gtFlags | dstAddrLo->gtFlags) & GTF_ALL_EFFECT); - - // Widen the upper half and assign it to dstAddrHi. - simdTree = gtNewSIMDNode(simdType, dupOp1, nullptr, SIMDIntrinsicWidenHi, simdBaseJitType, size); - GenTree* hiDest = new (this, GT_BLK) - GenTreeBlk(GT_BLK, simdType, dstAddrHi, typGetBlkLayout(getSIMDTypeSizeInBytes(clsHnd))); - GenTree* hiAsg = gtNewBlkOpNode(hiDest, simdTree, - false, // not volatile - true); // copyBlock - hiAsg->gtFlags |= ((simdTree->gtFlags | dstAddrHi->gtFlags) & GTF_ALL_EFFECT); - - retVal = gtNewOperNode(GT_COMMA, simdType, loAsg, hiAsg); - } - break; - case SIMDIntrinsicHWAccel: { GenTreeIntCon* intConstTree = new (this, GT_CNS_INT) GenTreeIntCon(TYP_INT, 1); diff --git a/src/coreclr/jit/simdashwintrinsic.cpp b/src/coreclr/jit/simdashwintrinsic.cpp index 3fd471dbac65e..2ccffaea8bdca 100644 --- a/src/coreclr/jit/simdashwintrinsic.cpp +++ b/src/coreclr/jit/simdashwintrinsic.cpp @@ -675,6 +675,7 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, { return gtNewSimdAbsNode(retType, op1, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ true); } + case NI_VectorT128_Sum: { @@ -694,6 +695,7 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, return gtNewSimdAsHWIntrinsicNode(retType, op1, NI_Vector128_ToScalar, simdBaseJitType, simdSize); } + case NI_VectorT256_Sum: { // HorizontalAdd combines pairs so we need log2(vectorLength) passes to sum all elements together. 
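That comment is the heart of the Sum expansion: each HorizontalAdd pass folds adjacent pairs, so the number of distinct partial sums halves per pass. A rough managed equivalent for the four-element 128-bit float case, where log2(4) = 2 passes suffice; SumSketch is an illustrative helper, assuming SSE3 is available:

    using System.Runtime.Intrinsics;
    using System.Runtime.Intrinsics.X86;

    static float SumSketch(Vector128<float> v)
    {
        v = Sse3.HorizontalAdd(v, v); // [a+b, c+d, a+b, c+d]
        v = Sse3.HorizontalAdd(v, v); // every lane now holds a+b+c+d
        return v.ToScalar();
    }

The 256-bit path must additionally combine the two 128-bit halves at the end, since HorizontalAdd on AVX operates within each 128-bit lane.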
@@ -731,11 +733,26 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic,
                     return gtNewSimdAsHWIntrinsicNode(retType, op1, NI_Vector128_ToScalar, simdBaseJitType, 16);
                 }
+
+                case NI_VectorT128_WidenLower:
+                case NI_VectorT256_WidenLower:
+                {
+                    return gtNewSimdWidenLowerNode(retType, op1, simdBaseJitType, simdSize,
+                                                   /* isSimdAsHWIntrinsic */ true);
+                }
+
+                case NI_VectorT128_WidenUpper:
+                case NI_VectorT256_WidenUpper:
+                {
+                    return gtNewSimdWidenUpperNode(retType, op1, simdBaseJitType, simdSize,
+                                                   /* isSimdAsHWIntrinsic */ true);
+                }
 #elif defined(TARGET_ARM64)
                 case NI_VectorT128_Abs:
                 {
                     return gtNewSimdAbsNode(retType, op1, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ true);
                 }
+
                 case NI_VectorT128_Sum:
                 {
                     GenTree* tmp;
@@ -783,6 +800,18 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic,
                         }
                     }
                 }
+
+                case NI_VectorT128_WidenLower:
+                {
+                    return gtNewSimdWidenLowerNode(retType, op1, simdBaseJitType, simdSize,
+                                                   /* isSimdAsHWIntrinsic */ true);
+                }
+
+                case NI_VectorT128_WidenUpper:
+                {
+                    return gtNewSimdWidenUpperNode(retType, op1, simdBaseJitType, simdSize,
+                                                   /* isSimdAsHWIntrinsic */ true);
+                }
 #else
 #error Unsupported platform
 #endif // !TARGET_XARCH && !TARGET_ARM64
diff --git a/src/coreclr/jit/simdashwintrinsiclistarm64.h b/src/coreclr/jit/simdashwintrinsiclistarm64.h
index 2c2522e605ed3..2810a0e6ecfb1 100644
--- a/src/coreclr/jit/simdashwintrinsiclistarm64.h
+++ b/src/coreclr/jit/simdashwintrinsiclistarm64.h
@@ -134,6 +134,8 @@ SIMD_AS_HWINTRINSIC_ID(VectorT128, op_Multiply,
 SIMD_AS_HWINTRINSIC_ID(VectorT128, op_Subtraction, 2, {NI_AdvSimd_Subtract, NI_AdvSimd_Subtract, NI_AdvSimd_Subtract, NI_AdvSimd_Subtract, NI_AdvSimd_Subtract, NI_AdvSimd_Subtract, NI_AdvSimd_Subtract, NI_AdvSimd_Subtract, NI_AdvSimd_Subtract, NI_AdvSimd_Arm64_Subtract}, SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(VectorT128, SquareRoot, 1, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_AdvSimd_Arm64_Sqrt, NI_AdvSimd_Arm64_Sqrt}, SimdAsHWIntrinsicFlag::None)
 SIMD_AS_HWINTRINSIC_ID(VectorT128, Sum, 1, {NI_VectorT128_Sum, NI_VectorT128_Sum, NI_VectorT128_Sum, NI_VectorT128_Sum, NI_VectorT128_Sum, NI_VectorT128_Sum, NI_VectorT128_Sum, NI_VectorT128_Sum, NI_VectorT128_Sum, NI_VectorT128_Sum}, SimdAsHWIntrinsicFlag::None)
+SIMD_AS_HWINTRINSIC_ID(VectorT128, WidenLower, 1, {NI_VectorT128_WidenLower, NI_VectorT128_WidenLower, NI_VectorT128_WidenLower, NI_VectorT128_WidenLower, NI_VectorT128_WidenLower, NI_VectorT128_WidenLower, NI_VectorT128_WidenLower, NI_VectorT128_WidenLower, NI_VectorT128_WidenLower, NI_VectorT128_WidenLower}, SimdAsHWIntrinsicFlag::None)
+SIMD_AS_HWINTRINSIC_ID(VectorT128, WidenUpper, 1, {NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper}, SimdAsHWIntrinsicFlag::None)
 #undef SIMD_AS_HWINTRINSIC_NM
 #undef SIMD_AS_HWINTRINSIC_ID
diff --git a/src/coreclr/jit/simdashwintrinsiclistxarch.h b/src/coreclr/jit/simdashwintrinsiclistxarch.h
index 355e7c3f5f54f..08cf517828283 100644
--- a/src/coreclr/jit/simdashwintrinsiclistxarch.h
+++ b/src/coreclr/jit/simdashwintrinsiclistxarch.h
@@ -134,6 +134,8 @@ SIMD_AS_HWINTRINSIC_ID(VectorT128, op_Multiply,
 SIMD_AS_HWINTRINSIC_ID(VectorT128, op_Subtraction, 2, {NI_SSE2_Subtract, NI_SSE2_Subtract, NI_SSE2_Subtract, NI_SSE2_Subtract, NI_SSE2_Subtract,
NI_SSE2_Subtract, NI_SSE2_Subtract, NI_SSE2_Subtract, NI_SSE_Subtract, NI_SSE2_Subtract}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, SquareRoot, 1, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_SSE_Sqrt, NI_SSE2_Sqrt}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT128, Sum, 1, {NI_Illegal, NI_Illegal, NI_VectorT128_Sum, NI_VectorT128_Sum, NI_VectorT128_Sum, NI_VectorT128_Sum, NI_Illegal, NI_Illegal, NI_VectorT128_Sum, NI_VectorT128_Sum}, SimdAsHWIntrinsicFlag::None) +SIMD_AS_HWINTRINSIC_ID(VectorT128, WidenLower, 1, {NI_VectorT128_WidenLower, NI_VectorT128_WidenLower, NI_VectorT128_WidenLower, NI_VectorT128_WidenLower, NI_VectorT128_WidenLower, NI_VectorT128_WidenLower, NI_VectorT128_WidenLower, NI_VectorT128_WidenLower, NI_VectorT128_WidenLower, NI_VectorT128_WidenLower}, SimdAsHWIntrinsicFlag::None) +SIMD_AS_HWINTRINSIC_ID(VectorT128, WidenUpper, 1, {NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper, NI_VectorT128_WidenUpper}, SimdAsHWIntrinsicFlag::None) // ************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************* // ISA ID Name NumArg Instructions Flags @@ -174,6 +176,8 @@ SIMD_AS_HWINTRINSIC_ID(VectorT256, op_Multiply, SIMD_AS_HWINTRINSIC_ID(VectorT256, op_Subtraction, 2, {NI_AVX2_Subtract, NI_AVX2_Subtract, NI_AVX2_Subtract, NI_AVX2_Subtract, NI_AVX2_Subtract, NI_AVX2_Subtract, NI_AVX2_Subtract, NI_AVX2_Subtract, NI_AVX_Subtract, NI_AVX_Subtract}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT256, SquareRoot, 1, {NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_Illegal, NI_AVX_Sqrt, NI_AVX_Sqrt}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT256, Sum, 1, {NI_Illegal, NI_Illegal, NI_VectorT256_Sum, NI_VectorT256_Sum, NI_VectorT256_Sum, NI_VectorT256_Sum, NI_Illegal, NI_Illegal, NI_VectorT256_Sum, NI_VectorT256_Sum}, SimdAsHWIntrinsicFlag::None) +SIMD_AS_HWINTRINSIC_ID(VectorT256, WidenLower, 1, {NI_VectorT256_WidenLower, NI_VectorT256_WidenLower, NI_VectorT256_WidenLower, NI_VectorT256_WidenLower, NI_VectorT256_WidenLower, NI_VectorT256_WidenLower, NI_VectorT256_WidenLower, NI_VectorT256_WidenLower, NI_VectorT256_WidenLower, NI_VectorT256_WidenLower}, SimdAsHWIntrinsicFlag::None) +SIMD_AS_HWINTRINSIC_ID(VectorT256, WidenUpper, 1, {NI_VectorT256_WidenUpper, NI_VectorT256_WidenUpper, NI_VectorT256_WidenUpper, NI_VectorT256_WidenUpper, NI_VectorT256_WidenUpper, NI_VectorT256_WidenUpper, NI_VectorT256_WidenUpper, NI_VectorT256_WidenUpper, NI_VectorT256_WidenUpper, NI_VectorT256_WidenUpper}, SimdAsHWIntrinsicFlag::None) #undef SIMD_AS_HWINTRINSIC_NM #undef SIMD_AS_HWINTRINSIC_ID diff --git a/src/coreclr/jit/simdcodegenxarch.cpp b/src/coreclr/jit/simdcodegenxarch.cpp index ea391b992e064..5a3f0296d1813 100644 --- a/src/coreclr/jit/simdcodegenxarch.cpp +++ 
b/src/coreclr/jit/simdcodegenxarch.cpp @@ -241,59 +241,6 @@ instruction CodeGen::getOpForSIMDIntrinsic(SIMDIntrinsicID intrinsicId, var_type result = INS_cvttsd2si; break; - case SIMDIntrinsicWidenLo: - // Some of these have multiple instruction implementations, with one instruction to widen the lo half, - // and another to widen the hi half. - switch (baseType) - { - case TYP_FLOAT: - result = INS_cvtps2pd; - break; - case TYP_INT: - case TYP_UINT: - result = INS_punpckldq; - break; - case TYP_SHORT: - case TYP_USHORT: - result = INS_punpcklwd; - break; - case TYP_BYTE: - case TYP_UBYTE: - result = INS_punpcklbw; - break; - default: - assert(!"Invalid baseType for SIMDIntrinsicWidenLo"); - result = INS_invalid; - break; - } - break; - - case SIMDIntrinsicWidenHi: - switch (baseType) - { - case TYP_FLOAT: - // For this case, we actually use the same instruction. - result = INS_cvtps2pd; - break; - case TYP_INT: - case TYP_UINT: - result = INS_punpckhdq; - break; - case TYP_SHORT: - case TYP_USHORT: - result = INS_punpckhwd; - break; - case TYP_BYTE: - case TYP_UBYTE: - result = INS_punpckhbw; - break; - default: - assert(!"Invalid baseType for SIMDIntrinsicWidenHi"); - result = INS_invalid; - break; - } - break; - case SIMDIntrinsicShiftLeftInternal: switch (baseType) { @@ -1166,98 +1113,6 @@ void CodeGen::genSIMDExtractUpperHalf(GenTreeSIMD* simdNode, regNumber srcReg, r } } -//-------------------------------------------------------------------------------- -// genSIMDIntrinsicWiden: Generate code for SIMD Intrinsic Widen operations -// -// Arguments: -// simdNode - The GT_SIMD node -// -// Notes: -// The Widen intrinsics are broken into separate intrinsics for the two results. -// -void CodeGen::genSIMDIntrinsicWiden(GenTreeSIMD* simdNode) -{ - assert((simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenLo) || - (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi)); - - GenTree* op1 = simdNode->gtGetOp1(); - var_types baseType = simdNode->GetSimdBaseType(); - regNumber targetReg = simdNode->GetRegNum(); - assert(targetReg != REG_NA); - var_types simdType = simdNode->TypeGet(); - SIMDLevel level = compiler->getSIMDSupportLevel(); - - genConsumeOperands(simdNode); - regNumber op1Reg = op1->GetRegNum(); - regNumber srcReg = op1Reg; - emitAttr emitSize = emitActualTypeSize(simdType); - instruction widenIns = getOpForSIMDIntrinsic(simdNode->gtSIMDIntrinsicID, baseType); - - if (baseType == TYP_FLOAT) - { - if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi) - { - genSIMDExtractUpperHalf(simdNode, srcReg, targetReg); - srcReg = targetReg; - } - inst_RV_RV(widenIns, targetReg, srcReg, simdType); - } - else - { - // We will generate the following on AVX: - // vpermq targetReg, op1Reg, 0xd4|0xe8 - // vpxor tmpReg, tmpReg - // vpcmpgt[b|w|d] tmpReg, targetReg (if basetype is signed) - // vpunpck[l|h][bw|wd|dq] targetReg, tmpReg - regNumber tmpReg = simdNode->GetSingleTempReg(RBM_ALLFLOAT); - assert(tmpReg != op1Reg); - - if (level == SIMD_AVX2_Supported) - { - // permute op1Reg and put it into targetReg - unsigned ival = 0xd4; - if (simdNode->gtSIMDIntrinsicID == SIMDIntrinsicWidenHi) - { - ival = 0xe8; - } - assert((ival >= 0) && (ival <= 255)); - GetEmitter()->emitIns_R_R_I(INS_vpermq, emitSize, targetReg, op1Reg, (int8_t)ival); - } - else - { - inst_Mov(simdType, targetReg, op1Reg, /* canSkip */ true); - } - - genSIMDZero(simdType, baseType, tmpReg); - if (!varTypeIsUnsigned(baseType)) - { - instruction compareIns = INS_invalid; - - if (baseType == TYP_INT) - { - compareIns = INS_pcmpgtd; - 
} - else if (baseType == TYP_SHORT) - { - compareIns = INS_pcmpgtw; - } - else if (baseType == TYP_BYTE) - { - compareIns = INS_pcmpgtb; - } - else if ((baseType == TYP_LONG) && (compiler->getSIMDSupportLevel() >= SIMD_SSE4_Supported)) - { - compareIns = INS_pcmpgtq; - } - - assert(compareIns != INS_invalid); - inst_RV_RV(compareIns, tmpReg, targetReg, simdType, emitSize); - } - inst_RV_RV(widenIns, targetReg, tmpReg, simdType); - } - genProduceReg(simdNode); -} - //-------------------------------------------------------------------------------- // genSIMDIntrinsicBinOp: Generate code for SIMD Intrinsic binary operations // add, sub, mul, bit-wise And, AndNot and Or. @@ -1781,11 +1636,6 @@ void CodeGen::genSIMDIntrinsic(GenTreeSIMD* simdNode) genSIMDIntrinsic64BitConvert(simdNode); break; - case SIMDIntrinsicWidenLo: - case SIMDIntrinsicWidenHi: - genSIMDIntrinsicWiden(simdNode); - break; - case SIMDIntrinsicSub: case SIMDIntrinsicBitwiseAnd: case SIMDIntrinsicBitwiseOr: diff --git a/src/coreclr/jit/simdintrinsiclist.h b/src/coreclr/jit/simdintrinsiclist.h index 1624382dfb528..0b354b533702c 100644 --- a/src/coreclr/jit/simdintrinsiclist.h +++ b/src/coreclr/jit/simdintrinsiclist.h @@ -78,8 +78,6 @@ SIMD_INTRINSIC("ConvertToDouble", false, ConvertToDouble, SIMD_INTRINSIC("ConvertToInt32", false, ConvertToInt32, "ConvertToInt32", TYP_STRUCT, 1, {TYP_STRUCT, TYP_UNDEF, TYP_UNDEF}, {TYP_FLOAT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) // Convert double to long SIMD_INTRINSIC("ConvertToInt64", false, ConvertToInt64, "ConvertToInt64", TYP_STRUCT, 1, {TYP_STRUCT, TYP_UNDEF, TYP_UNDEF}, {TYP_DOUBLE, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) -// Widen one input Vector to two Vectors: dest1 contains the lower half of elements in src, and dest2 contains the upper half of elements in src. 
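For reference, the surface that sat on top of the intrinsic removed below is the two-destination System.Numerics API, which remains public and is now decomposed into the lower/upper helpers instead. A usage sketch (illustrative values only):

    using System.Numerics;

    Vector<byte> source = new Vector<byte>((byte)3);
    Vector.Widen(source, out Vector<ushort> low, out Vector<ushort> high);
    // low receives the widened lower half of source and high the upper half,
    // mirroring the old WidenLo/WidenHi split.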
-SIMD_INTRINSIC("Widen", false, Widen, "Widen", TYP_VOID, 3, {TYP_STRUCT, TYP_BYREF, TYP_BYREF}, {TYP_INT, TYP_FLOAT, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) // Miscellaneous SIMD_INTRINSIC("get_IsHardwareAccelerated", false, HWAccel, "HWAccel", TYP_BOOL, 0, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) @@ -98,10 +96,6 @@ SIMD_INTRINSIC("ShiftRightInternal", false, ShiftRightInternal, SIMD_INTRINSIC("UpperSave", false, UpperSave, "UpperSave Internal", TYP_STRUCT, 2, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) SIMD_INTRINSIC("UpperRestore", false, UpperRestore, "UpperRestore Internal", TYP_STRUCT, 2, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) -// Internal intrinsics for Widen -SIMD_INTRINSIC("WidenHi", false, WidenHi, "WidenHi", TYP_VOID, 2, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}, {TYP_INT, TYP_FLOAT, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) -SIMD_INTRINSIC("WidenLo", false, WidenLo, "WidenLo", TYP_VOID, 2, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}, {TYP_INT, TYP_FLOAT, TYP_USHORT, TYP_UBYTE, TYP_BYTE, TYP_SHORT, TYP_UINT, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) - SIMD_INTRINSIC(nullptr, false, Invalid, "Invalid", TYP_UNDEF, 0, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}, {TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF, TYP_UNDEF}) #undef SIMD_INTRINSIC #else // !defined(TARGET_XARCH) && !defined(TARGET_ARM64) diff --git a/src/coreclr/jit/valuenum.cpp b/src/coreclr/jit/valuenum.cpp index 6e31f385d5558..db8ff087b93c5 100644 --- a/src/coreclr/jit/valuenum.cpp +++ b/src/coreclr/jit/valuenum.cpp @@ -6186,10 +6186,6 @@ void ValueNumStore::InitValueNumStoreStatics() // SIMDIntrinsicInit has an entry of 2 for numArgs, but it only has one normal arg ValueNumFuncSetArity(VNF_SIMD_Init, 1); - // SIMDIntrinsicWidenHi has an entry of 2 for numArgs, but it only has one normal arg - ValueNumFuncSetArity(VNF_SIMD_WidenHi, 1); - // SIMDIntrinsicWidenLo has an entry of 2 for numArgs, but it only has one normal arg - ValueNumFuncSetArity(VNF_SIMD_WidenLo, 1); // Some SIMD intrinsic nodes have an extra VNF_SimdType arg // diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs index f2c2784969b3b..f96e13199ff0b 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs @@ -2690,173 +2690,47 @@ public static bool TryCopyTo(this Vector128 vector, Span destination) /// The vector whose elements are to be widened. /// A pair of vectors that contain the widened lower and upper halves of . 
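Spelled out with its type arguments, each rewritten overload now has the same expression-bodied shape; the byte-to-ushort case reads as follows, and the other six element types follow the same pattern (a reconstruction shown for readability, not a new API):

    [CLSCompliant(false)]
    public static unsafe (Vector128<ushort> Lower, Vector128<ushort> Upper) Widen(Vector128<byte> source)
        => (WidenLower(source), WidenUpper(source));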
[CLSCompliant(false)] - [Intrinsic] public static unsafe (Vector128 Lower, Vector128 Upper) Widen(Vector128 source) - { - Unsafe.SkipInit(out Vector128 lower); - Unsafe.SkipInit(out Vector128 upper); - - for (int i = 0; i < Vector128.Count; i++) - { - var value = (ushort)source.GetElementUnsafe(i); - lower.SetElementUnsafe(i, value); - } - - for (int i = Vector128.Count; i < Vector128.Count; i++) - { - var value = (ushort)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector128.Count, value); - } - - return (lower, upper); - } + => (WidenLower(source), WidenUpper(source)); /// Widens a into two . /// The vector whose elements are to be widened. /// A pair of vectors that contain the widened lower and upper halves of . - [Intrinsic] public static unsafe (Vector128 Lower, Vector128 Upper) Widen(Vector128 source) - { - Unsafe.SkipInit(out Vector128 lower); - Unsafe.SkipInit(out Vector128 upper); - - for (int i = 0; i < Vector128.Count; i++) - { - var value = (int)source.GetElementUnsafe(i); - lower.SetElementUnsafe(i, value); - } - - for (int i = Vector128.Count; i < Vector128.Count; i++) - { - var value = (int)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector128.Count, value); - } - - return (lower, upper); - } + => (WidenLower(source), WidenUpper(source)); /// Widens a into two . /// The vector whose elements are to be widened. /// A pair of vectors that contain the widened lower and upper halves of . - [Intrinsic] public static unsafe (Vector128 Lower, Vector128 Upper) Widen(Vector128 source) - { - Unsafe.SkipInit(out Vector128 lower); - Unsafe.SkipInit(out Vector128 upper); - - for (int i = 0; i < Vector128.Count; i++) - { - var value = (long)source.GetElementUnsafe(i); - lower.SetElementUnsafe(i, value); - } - - for (int i = Vector128.Count; i < Vector128.Count; i++) - { - var value = (long)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector128.Count, value); - } - - return (lower, upper); - } + => (WidenLower(source), WidenUpper(source)); /// Widens a into two . /// The vector whose elements are to be widened. /// A pair of vectors that contain the widened lower and upper halves of . [CLSCompliant(false)] - [Intrinsic] public static unsafe (Vector128 Lower, Vector128 Upper) Widen(Vector128 source) - { - Unsafe.SkipInit(out Vector128 lower); - Unsafe.SkipInit(out Vector128 upper); - - for (int i = 0; i < Vector128.Count; i++) - { - var value = (short)source.GetElementUnsafe(i); - lower.SetElementUnsafe(i, value); - } - - for (int i = Vector128.Count; i < Vector128.Count; i++) - { - var value = (short)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector128.Count, value); - } - - return (lower, upper); - } + => (WidenLower(source), WidenUpper(source)); /// Widens a into two . /// The vector whose elements are to be widened. /// A pair of vectors that contain the widened lower and upper halves of . - [Intrinsic] public static unsafe (Vector128 Lower, Vector128 Upper) Widen(Vector128 source) - { - Unsafe.SkipInit(out Vector128 lower); - Unsafe.SkipInit(out Vector128 upper); - - for (int i = 0; i < Vector128.Count; i++) - { - var value = (double)source.GetElementUnsafe(i); - lower.SetElementUnsafe(i, value); - } - - for (int i = Vector128.Count; i < Vector128.Count; i++) - { - var value = (double)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector128.Count, value); - } - - return (lower, upper); - } + => (WidenLower(source), WidenUpper(source)); /// Widens a into two . /// The vector whose elements are to be widened. 
/// A pair of vectors that contain the widened lower and upper halves of . [CLSCompliant(false)] - [Intrinsic] public static unsafe (Vector128 Lower, Vector128 Upper) Widen(Vector128 source) - { - Unsafe.SkipInit(out Vector128 lower); - Unsafe.SkipInit(out Vector128 upper); - - for (int i = 0; i < Vector128.Count; i++) - { - var value = (uint)source.GetElementUnsafe(i); - lower.SetElementUnsafe(i, value); - } - - for (int i = Vector128.Count; i < Vector128.Count; i++) - { - var value = (uint)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector128.Count, value); - } - - return (lower, upper); - } + => (WidenLower(source), WidenUpper(source)); /// Widens a into two . /// The vector whose elements are to be widened. /// A pair of vectors that contain the widened lower and upper halves of . [CLSCompliant(false)] - [Intrinsic] public static unsafe (Vector128 Lower, Vector128 Upper) Widen(Vector128 source) - { - Unsafe.SkipInit(out Vector128 lower); - Unsafe.SkipInit(out Vector128 upper); - - for (int i = 0; i < Vector128.Count; i++) - { - var value = (ulong)source.GetElementUnsafe(i); - lower.SetElementUnsafe(i, value); - } - - for (int i = Vector128.Count; i < Vector128.Count; i++) - { - var value = (ulong)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector128.Count, value); - } - - return (lower, upper); - } + => (WidenLower(source), WidenUpper(source)); /// Creates a new with the element at the specified index set to the specified value and the remaining elements set to the same value as that in the given vector. /// The type of the input vector. @@ -2962,5 +2836,215 @@ internal static void SetElementUnsafe(in this Vector128 vector, int index, Debug.Assert((index >= 0) && (index < Vector128.Count)); Unsafe.Add(ref Unsafe.As, T>(ref Unsafe.AsRef(in vector)), index) = value; } + + [Intrinsic] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static Vector128 WidenLower(Vector128 source) + { + Unsafe.SkipInit(out Vector128 lower); + + for (int i = 0; i < Vector128.Count; i++) + { + var value = (ushort)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); + } + + return lower; + } + + [Intrinsic] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe Vector128 WidenLower(Vector128 source) + { + Unsafe.SkipInit(out Vector128 lower); + + for (int i = 0; i < Vector128.Count; i++) + { + var value = (int)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); + } + + return lower; + } + + [Intrinsic] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe Vector128 WidenLower(Vector128 source) + { + Unsafe.SkipInit(out Vector128 lower); + + for (int i = 0; i < Vector128.Count; i++) + { + var value = (long)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); + } + + return lower; + } + + [Intrinsic] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe Vector128 WidenLower(Vector128 source) + { + Unsafe.SkipInit(out Vector128 lower); + + for (int i = 0; i < Vector128.Count; i++) + { + var value = (short)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); + } + + return lower; + } + + [Intrinsic] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe Vector128 WidenLower(Vector128 source) + { + Unsafe.SkipInit(out Vector128 lower); + + for (int i = 0; i < Vector128.Count; i++) + { + var value = (double)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); + } + + return lower; + } + + [Intrinsic] + 
[MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe Vector128 WidenLower(Vector128 source) + { + Unsafe.SkipInit(out Vector128 lower); + + for (int i = 0; i < Vector128.Count; i++) + { + var value = (uint)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); + } + + return lower; + } + + [Intrinsic] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe Vector128 WidenLower(Vector128 source) + { + Unsafe.SkipInit(out Vector128 lower); + + for (int i = 0; i < Vector128.Count; i++) + { + var value = (ulong)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); + } + + return lower; + } + + [Intrinsic] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static Vector128 WidenUpper(Vector128 source) + { + Unsafe.SkipInit(out Vector128 upper); + + for (int i = Vector128.Count; i < Vector128.Count; i++) + { + var value = (ushort)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector128.Count, value); + } + + return upper; + } + + [Intrinsic] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe Vector128 WidenUpper(Vector128 source) + { + Unsafe.SkipInit(out Vector128 upper); + + for (int i = Vector128.Count; i < Vector128.Count; i++) + { + var value = (int)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector128.Count, value); + } + + return upper; + } + + [Intrinsic] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe Vector128 WidenUpper(Vector128 source) + { + Unsafe.SkipInit(out Vector128 upper); + + for (int i = Vector128.Count; i < Vector128.Count; i++) + { + var value = (long)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector128.Count, value); + } + + return upper; + } + + [Intrinsic] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe Vector128 WidenUpper(Vector128 source) + { + Unsafe.SkipInit(out Vector128 upper); + + for (int i = Vector128.Count; i < Vector128.Count; i++) + { + var value = (short)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector128.Count, value); + } + + return upper; + } + + [Intrinsic] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe Vector128 WidenUpper(Vector128 source) + { + Unsafe.SkipInit(out Vector128 upper); + + for (int i = Vector128.Count; i < Vector128.Count; i++) + { + var value = (double)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector128.Count, value); + } + + return upper; + } + + [Intrinsic] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe Vector128 WidenUpper(Vector128 source) + { + Unsafe.SkipInit(out Vector128 upper); + + for (int i = Vector128.Count; i < Vector128.Count; i++) + { + var value = (uint)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector128.Count, value); + } + + return upper; + } + + [Intrinsic] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe Vector128 WidenUpper(Vector128 source) + { + Unsafe.SkipInit(out Vector128 upper); + + for (int i = Vector128.Count; i < Vector128.Count; i++) + { + var value = (ulong)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector128.Count, value); + } + + return upper; + } } } diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs index 72ab4796336f5..f2fc2972e75b7 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs +++ 
b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs @@ -2788,173 +2788,47 @@ public static bool TryCopyTo(this Vector256 vector, Span destination) /// The vector whose elements are to be widened. /// A pair of vectors that contain the widened lower and upper halves of . [CLSCompliant(false)] - [Intrinsic] public static unsafe (Vector256 Lower, Vector256 Upper) Widen(Vector256 source) - { - Unsafe.SkipInit(out Vector256 lower); - Unsafe.SkipInit(out Vector256 upper); - - for (int i = 0; i < Vector256.Count; i++) - { - var value = (ushort)source.GetElementUnsafe(i); - lower.SetElementUnsafe(i, value); - } - - for (int i = Vector256.Count; i < Vector256.Count; i++) - { - var value = (ushort)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector256.Count, value); - } - - return (lower, upper); - } + => (WidenLower(source), WidenUpper(source)); /// Widens a into two . /// The vector whose elements are to be widened. /// A pair of vectors that contain the widened lower and upper halves of . - [Intrinsic] public static unsafe (Vector256 Lower, Vector256 Upper) Widen(Vector256 source) - { - Unsafe.SkipInit(out Vector256 lower); - Unsafe.SkipInit(out Vector256 upper); - - for (int i = 0; i < Vector256.Count; i++) - { - var value = (int)source.GetElementUnsafe(i); - lower.SetElementUnsafe(i, value); - } - - for (int i = Vector256.Count; i < Vector256.Count; i++) - { - var value = (int)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector256.Count, value); - } - - return (lower, upper); - } + => (WidenLower(source), WidenUpper(source)); /// Widens a into two . /// The vector whose elements are to be widened. /// A pair of vectors that contain the widened lower and upper halves of . - [Intrinsic] public static unsafe (Vector256 Lower, Vector256 Upper) Widen(Vector256 source) - { - Unsafe.SkipInit(out Vector256 lower); - Unsafe.SkipInit(out Vector256 upper); - - for (int i = 0; i < Vector256.Count; i++) - { - var value = (long)source.GetElementUnsafe(i); - lower.SetElementUnsafe(i, value); - } - - for (int i = Vector256.Count; i < Vector256.Count; i++) - { - var value = (long)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector256.Count, value); - } - - return (lower, upper); - } + => (WidenLower(source), WidenUpper(source)); /// Widens a into two . /// The vector whose elements are to be widened. /// A pair of vectors that contain the widened lower and upper halves of . [CLSCompliant(false)] - [Intrinsic] public static unsafe (Vector256 Lower, Vector256 Upper) Widen(Vector256 source) - { - Unsafe.SkipInit(out Vector256 lower); - Unsafe.SkipInit(out Vector256 upper); - - for (int i = 0; i < Vector256.Count; i++) - { - var value = (short)source.GetElementUnsafe(i); - lower.SetElementUnsafe(i, value); - } - - for (int i = Vector256.Count; i < Vector256.Count; i++) - { - var value = (short)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector256.Count, value); - } - - return (lower, upper); - } + => (WidenLower(source), WidenUpper(source)); /// Widens a into two . /// The vector whose elements are to be widened. /// A pair of vectors that contain the widened lower and upper halves of . 
- [Intrinsic] public static unsafe (Vector256 Lower, Vector256 Upper) Widen(Vector256 source) - { - Unsafe.SkipInit(out Vector256 lower); - Unsafe.SkipInit(out Vector256 upper); - - for (int i = 0; i < Vector256.Count; i++) - { - var value = (double)source.GetElementUnsafe(i); - lower.SetElementUnsafe(i, value); - } - - for (int i = Vector256.Count; i < Vector256.Count; i++) - { - var value = (double)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector256.Count, value); - } - - return (lower, upper); - } + => (WidenLower(source), WidenUpper(source)); /// Widens a into two . /// The vector whose elements are to be widened. /// A pair of vectors that contain the widened lower and upper halves of . [CLSCompliant(false)] - [Intrinsic] public static unsafe (Vector256 Lower, Vector256 Upper) Widen(Vector256 source) - { - Unsafe.SkipInit(out Vector256 lower); - Unsafe.SkipInit(out Vector256 upper); - - for (int i = 0; i < Vector256.Count; i++) - { - var value = (uint)source.GetElementUnsafe(i); - lower.SetElementUnsafe(i, value); - } - - for (int i = Vector256.Count; i < Vector256.Count; i++) - { - var value = (uint)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector256.Count, value); - } - - return (lower, upper); - } + => (WidenLower(source), WidenUpper(source)); /// Widens a into two . /// The vector whose elements are to be widened. /// A pair of vectors that contain the widened lower and upper halves of . [CLSCompliant(false)] - [Intrinsic] public static unsafe (Vector256 Lower, Vector256 Upper) Widen(Vector256 source) - { - Unsafe.SkipInit(out Vector256 lower); - Unsafe.SkipInit(out Vector256 upper); - - for (int i = 0; i < Vector256.Count; i++) - { - var value = (ulong)source.GetElementUnsafe(i); - lower.SetElementUnsafe(i, value); - } - - for (int i = Vector256.Count; i < Vector256.Count; i++) - { - var value = (ulong)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector256.Count, value); - } - - return (lower, upper); - } + => (WidenLower(source), WidenUpper(source)); /// Creates a new with the element at the specified index set to the specified value and the remaining elements set to the same value as that in the given vector. /// The type of the input vector. 
@@ -3076,5 +2950,215 @@ internal static void SetElementUnsafe(in this Vector256 vector, int index, Debug.Assert((index >= 0) && (index < Vector256.Count)); Unsafe.Add(ref Unsafe.As, T>(ref Unsafe.AsRef(in vector)), index) = value; } + + [Intrinsic] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static Vector256 WidenLower(Vector256 source) + { + Unsafe.SkipInit(out Vector256 lower); + + for (int i = 0; i < Vector256.Count; i++) + { + var value = (ushort)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); + } + + return lower; + } + + [Intrinsic] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe Vector256 WidenLower(Vector256 source) + { + Unsafe.SkipInit(out Vector256 lower); + + for (int i = 0; i < Vector256.Count; i++) + { + var value = (int)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); + } + + return lower; + } + + [Intrinsic] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe Vector256 WidenLower(Vector256 source) + { + Unsafe.SkipInit(out Vector256 lower); + + for (int i = 0; i < Vector256.Count; i++) + { + var value = (long)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); + } + + return lower; + } + + [Intrinsic] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe Vector256 WidenLower(Vector256 source) + { + Unsafe.SkipInit(out Vector256 lower); + + for (int i = 0; i < Vector256.Count; i++) + { + var value = (short)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); + } + + return lower; + } + + [Intrinsic] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe Vector256 WidenLower(Vector256 source) + { + Unsafe.SkipInit(out Vector256 lower); + + for (int i = 0; i < Vector256.Count; i++) + { + var value = (double)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); + } + + return lower; + } + + [Intrinsic] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe Vector256 WidenLower(Vector256 source) + { + Unsafe.SkipInit(out Vector256 lower); + + for (int i = 0; i < Vector256.Count; i++) + { + var value = (uint)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); + } + + return lower; + } + + [Intrinsic] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe Vector256 WidenLower(Vector256 source) + { + Unsafe.SkipInit(out Vector256 lower); + + for (int i = 0; i < Vector256.Count; i++) + { + var value = (ulong)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); + } + + return lower; + } + + [Intrinsic] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static Vector256 WidenUpper(Vector256 source) + { + Unsafe.SkipInit(out Vector256 upper); + + for (int i = Vector256.Count; i < Vector256.Count; i++) + { + var value = (ushort)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector256.Count, value); + } + + return upper; + } + + [Intrinsic] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe Vector256 WidenUpper(Vector256 source) + { + Unsafe.SkipInit(out Vector256 upper); + + for (int i = Vector256.Count; i < Vector256.Count; i++) + { + var value = (int)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector256.Count, value); + } + + return upper; + } + + [Intrinsic] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe Vector256 WidenUpper(Vector256 source) + { + Unsafe.SkipInit(out Vector256 upper); + + for (int i = Vector256.Count; i < Vector256.Count; 
i++) + { + var value = (long)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector256.Count, value); + } + + return upper; + } + + [Intrinsic] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe Vector256 WidenUpper(Vector256 source) + { + Unsafe.SkipInit(out Vector256 upper); + + for (int i = Vector256.Count; i < Vector256.Count; i++) + { + var value = (short)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector256.Count, value); + } + + return upper; + } + + [Intrinsic] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe Vector256 WidenUpper(Vector256 source) + { + Unsafe.SkipInit(out Vector256 upper); + + for (int i = Vector256.Count; i < Vector256.Count; i++) + { + var value = (double)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector256.Count, value); + } + + return upper; + } + + [Intrinsic] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe Vector256 WidenUpper(Vector256 source) + { + Unsafe.SkipInit(out Vector256 upper); + + for (int i = Vector256.Count; i < Vector256.Count; i++) + { + var value = (uint)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector256.Count, value); + } + + return upper; + } + + [Intrinsic] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe Vector256 WidenUpper(Vector256 source) + { + Unsafe.SkipInit(out Vector256 upper); + + for (int i = Vector256.Count; i < Vector256.Count; i++) + { + var value = (ulong)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector256.Count, value); + } + + return upper; + } } } diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector64.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector64.cs index dbfea1c89883e..aca9c0f2d886a 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector64.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector64.cs @@ -1994,35 +1994,118 @@ public static bool TryCopyTo(this Vector64 vector, Span destination) /// The vector whose elements are to be widened. /// A pair of vectors that contain the widened lower and upper halves of . [CLSCompliant(false)] - [Intrinsic] public static unsafe (Vector64 Lower, Vector64 Upper) Widen(Vector64 source) + => (WidenLower(source), WidenUpper(source)); + + /// Widens a into two . + /// The vector whose elements are to be widened. + /// A pair of vectors that contain the widened lower and upper halves of . + public static unsafe (Vector64 Lower, Vector64 Upper) Widen(Vector64 source) + => (WidenLower(source), WidenUpper(source)); + + /// Widens a into two . + /// The vector whose elements are to be widened. + /// A pair of vectors that contain the widened lower and upper halves of . + public static unsafe (Vector64 Lower, Vector64 Upper) Widen(Vector64 source) + => (WidenLower(source), WidenUpper(source)); + + /// Widens a into two . + /// The vector whose elements are to be widened. + /// A pair of vectors that contain the widened lower and upper halves of . + [CLSCompliant(false)] + public static unsafe (Vector64 Lower, Vector64 Upper) Widen(Vector64 source) + => (WidenLower(source), WidenUpper(source)); + + /// Widens a into two . + /// The vector whose elements are to be widened. + /// A pair of vectors that contain the widened lower and upper halves of . 
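A quick usage sketch of the resulting Vector64 surface (illustrative values; unsigned sources are zero-extended):

    using System.Runtime.Intrinsics;

    Vector64<byte> v = Vector64.Create((byte)1, 2, 3, 4, 5, 6, 7, 8);
    (Vector64<ushort> lower, Vector64<ushort> upper) = Vector64.Widen(v);
    // lower = <1, 2, 3, 4>, upper = <5, 6, 7, 8>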
+ public static unsafe (Vector64 Lower, Vector64 Upper) Widen(Vector64 source) + => (WidenLower(source), WidenUpper(source)); + + /// Widens a into two . + /// The vector whose elements are to be widened. + /// A pair of vectors that contain the widened lower and upper halves of . + [CLSCompliant(false)] + public static unsafe (Vector64 Lower, Vector64 Upper) Widen(Vector64 source) + => (WidenLower(source), WidenUpper(source)); + + /// Widens a into two . + /// The vector whose elements are to be widened. + /// A pair of vectors that contain the widened lower and upper halves of . + [CLSCompliant(false)] + public static unsafe (Vector64 Lower, Vector64 Upper) Widen(Vector64 source) + => (WidenLower(source), WidenUpper(source)); + + /// Creates a new with the element at the specified index set to the specified value and the remaining elements set to the same value as that in the given vector. + /// The type of the input vector. + /// The vector to get the remaining elements from. + /// The index of the element to set. + /// The value to set the element to. + /// A with the value of the element at set to and the remaining elements set to the same value as that in . + /// The type of () is not supported. + /// was less than zero or greater than the number of elements. + [Intrinsic] + public static Vector64 WithElement(this Vector64 vector, int index, T value) + where T : struct { - Unsafe.SkipInit(out Vector64 lower); - Unsafe.SkipInit(out Vector64 upper); + ThrowHelper.ThrowForUnsupportedIntrinsicsVector64BaseType(); - for (int i = 0; i < Vector64.Count; i++) + if ((uint)(index) >= (uint)(Vector64.Count)) { - var value = (ushort)source.GetElementUnsafe(i); - lower.SetElementUnsafe(i, value); + ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.index); } - for (int i = Vector64.Count; i < Vector64.Count; i++) + Vector64 result = vector; + result.SetElementUnsafe(index, value); + return result; + } + + /// Computes the exclusive-or of two vectors. + /// The vector to exclusive-or with . + /// The vector to exclusive-or with . + /// The type of the elements in the vector. + /// The exclusive-or of and . + [Intrinsic] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector64 Xor(Vector64 left, Vector64 right) + where T : struct => left ^ right; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static T GetElementUnsafe(in this Vector64 vector, int index) + where T : struct + { + Debug.Assert((index >= 0) && (index < Vector64.Count)); + return Unsafe.Add(ref Unsafe.As, T>(ref Unsafe.AsRef(in vector)), index); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static void SetElementUnsafe(in this Vector64 vector, int index, T value) + where T : struct + { + Debug.Assert((index >= 0) && (index < Vector64.Count)); + Unsafe.Add(ref Unsafe.As, T>(ref Unsafe.AsRef(in vector)), index) = value; + } + + [Intrinsic] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static Vector64 WidenLower(Vector64 source) + { + Unsafe.SkipInit(out Vector64 lower); + + for (int i = 0; i < Vector64.Count; i++) { var value = (ushort)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector64.Count, value); + lower.SetElementUnsafe(i, value); } - return (lower, upper); + return lower; } - /// Widens a into two . - /// The vector whose elements are to be widened. - /// A pair of vectors that contain the widened lower and upper halves of . 
[Intrinsic] - public static unsafe (Vector64 Lower, Vector64 Upper) Widen(Vector64 source) + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe Vector64 WidenLower(Vector64 source) { Unsafe.SkipInit(out Vector64 lower); - Unsafe.SkipInit(out Vector64 upper); for (int i = 0; i < Vector64.Count; i++) { @@ -2030,23 +2113,14 @@ public static unsafe (Vector64 Lower, Vector64 Upper) Widen(Vector64.Count; i < Vector64.Count; i++) - { - var value = (int)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector64.Count, value); - } - - return (lower, upper); + return lower; } - /// Widens a into two . - /// The vector whose elements are to be widened. - /// A pair of vectors that contain the widened lower and upper halves of . [Intrinsic] - public static unsafe (Vector64 Lower, Vector64 Upper) Widen(Vector64 source) + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe Vector64 WidenLower(Vector64 source) { Unsafe.SkipInit(out Vector64 lower); - Unsafe.SkipInit(out Vector64 upper); for (int i = 0; i < Vector64.Count; i++) { @@ -2054,24 +2128,14 @@ public static unsafe (Vector64 Lower, Vector64 Upper) Widen(Vector64 lower.SetElementUnsafe(i, value); } - for (int i = Vector64.Count; i < Vector64.Count; i++) - { - var value = (long)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector64.Count, value); - } - - return (lower, upper); + return lower; } - /// Widens a into two . - /// The vector whose elements are to be widened. - /// A pair of vectors that contain the widened lower and upper halves of . - [CLSCompliant(false)] [Intrinsic] - public static unsafe (Vector64 Lower, Vector64 Upper) Widen(Vector64 source) + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe Vector64 WidenLower(Vector64 source) { Unsafe.SkipInit(out Vector64 lower); - Unsafe.SkipInit(out Vector64 upper); for (int i = 0; i < Vector64.Count; i++) { @@ -2079,23 +2143,14 @@ public static unsafe (Vector64 Lower, Vector64 Upper) Widen(Vector lower.SetElementUnsafe(i, value); } - for (int i = Vector64.Count; i < Vector64.Count; i++) - { - var value = (short)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector64.Count, value); - } - - return (lower, upper); + return lower; } - /// Widens a into two . - /// The vector whose elements are to be widened. - /// A pair of vectors that contain the widened lower and upper halves of . [Intrinsic] - public static unsafe (Vector64 Lower, Vector64 Upper) Widen(Vector64 source) + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe Vector64 WidenLower(Vector64 source) { Unsafe.SkipInit(out Vector64 lower); - Unsafe.SkipInit(out Vector64 upper); for (int i = 0; i < Vector64.Count; i++) { @@ -2103,24 +2158,14 @@ public static unsafe (Vector64 Lower, Vector64 Upper) Widen(Vect lower.SetElementUnsafe(i, value); } - for (int i = Vector64.Count; i < Vector64.Count; i++) - { - var value = (double)source.GetElementUnsafe(i); - upper.SetElementUnsafe(i - Vector64.Count, value); - } - - return (lower, upper); + return lower; } - /// Widens a into two . - /// The vector whose elements are to be widened. - /// A pair of vectors that contain the widened lower and upper halves of . 

-        /// <summary>Widens a <see cref="Vector64{UInt16}" /> into two <see cref="Vector64{UInt32}" />.</summary>
-        /// <param name="source">The vector whose elements are to be widened.</param>
-        /// <returns>A pair of vectors that contain the widened lower and upper halves of <paramref name="source" />.</returns>
-        [CLSCompliant(false)]
         [Intrinsic]
-        public static unsafe (Vector64<uint> Lower, Vector64<uint> Upper) Widen(Vector64<ushort> source)
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static unsafe Vector64<uint> WidenLower(Vector64<ushort> source)
         {
             Unsafe.SkipInit(out Vector64<uint> lower);
-            Unsafe.SkipInit(out Vector64<uint> upper);

             for (int i = 0; i < Vector64<uint>.Count; i++)
             {
@@ -2128,24 +2173,14 @@ public static unsafe (Vector64<uint> Lower, Vector64<uint> Upper) Widen(Vector64<ushort> source)
                 var value = (uint)source.GetElementUnsafe(i);
                 lower.SetElementUnsafe(i, value);
             }

-            for (int i = Vector64<uint>.Count; i < Vector64<ushort>.Count; i++)
-            {
-                var value = (uint)source.GetElementUnsafe(i);
-                upper.SetElementUnsafe(i - Vector64<uint>.Count, value);
-            }
-
-            return (lower, upper);
+            return lower;
         }

-        /// <summary>Widens a <see cref="Vector64{UInt32}" /> into two <see cref="Vector64{UInt64}" />.</summary>
-        /// <param name="source">The vector whose elements are to be widened.</param>
-        /// <returns>A pair of vectors that contain the widened lower and upper halves of <paramref name="source" />.</returns>
-        [CLSCompliant(false)]
         [Intrinsic]
-        public static unsafe (Vector64<ulong> Lower, Vector64<ulong> Upper) Widen(Vector64<uint> source)
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static unsafe Vector64<ulong> WidenLower(Vector64<uint> source)
         {
             Unsafe.SkipInit(out Vector64<ulong> lower);
-            Unsafe.SkipInit(out Vector64<ulong> upper);

             for (int i = 0; i < Vector64<ulong>.Count; i++)
             {
@@ -2153,63 +2188,112 @@ public static unsafe (Vector64<ulong> Lower, Vector64<ulong> Upper) Widen(Vector64<uint> source)
                 var value = (ulong)source.GetElementUnsafe(i);
                 lower.SetElementUnsafe(i, value);
             }

-            for (int i = Vector64<ulong>.Count; i < Vector64<uint>.Count; i++)
+            return lower;
+        }
+
+        [Intrinsic]
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector64<ushort> WidenUpper(Vector64<byte> source)
+        {
+            Unsafe.SkipInit(out Vector64<ushort> upper);
+
+            for (int i = Vector64<ushort>.Count; i < Vector64<byte>.Count; i++)
             {
-                var value = (ulong)source.GetElementUnsafe(i);
-                upper.SetElementUnsafe(i - Vector64<ulong>.Count, value);
+                var value = (ushort)source.GetElementUnsafe(i);
+                upper.SetElementUnsafe(i - Vector64<ushort>.Count, value);
             }

-            return (lower, upper);
+            return upper;
         }

-        /// <summary>Creates a new <see cref="Vector64{T}" /> with the element at the specified index set to the specified value and the remaining elements set to the same value as that in the given vector.</summary>
-        /// <typeparam name="T">The type of the input vector.</typeparam>
-        /// <param name="vector">The vector to get the remaining elements from.</param>
-        /// <param name="index">The index of the element to set.</param>
-        /// <param name="value">The value to set the element to.</param>
-        /// <returns>A <see cref="Vector64{T}" /> with the value of the element at <paramref name="index" /> set to <paramref name="value" /> and the remaining elements set to the same value as that in <paramref name="vector" />.</returns>
-        /// <exception cref="NotSupportedException">The type of <paramref name="vector" /> and <paramref name="value" /> (<typeparamref name="T" />) is not supported.</exception>
-        /// <exception cref="ArgumentOutOfRangeException"><paramref name="index" /> was less than zero or greater than the number of elements.</exception>
         [Intrinsic]
-        public static Vector64<T> WithElement<T>(this Vector64<T> vector, int index, T value)
-            where T : struct
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static unsafe Vector64<int> WidenUpper(Vector64<short> source)
         {
-            ThrowHelper.ThrowForUnsupportedIntrinsicsVector64BaseType<T>();
+            Unsafe.SkipInit(out Vector64<int> upper);

-            if ((uint)(index) >= (uint)(Vector64<T>.Count))
+            for (int i = Vector64<int>.Count; i < Vector64<short>.Count; i++)
             {
-                ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.index);
+                var value = (int)source.GetElementUnsafe(i);
+                upper.SetElementUnsafe(i - Vector64<int>.Count, value);
             }

-            Vector64<T> result = vector;
-            result.SetElementUnsafe(index, value);
-            return result;
+            return upper;
         }

-        /// <summary>Computes the exclusive-or of two vectors.</summary>
-        /// <param name="left">The vector to exclusive-or with <paramref name="right" />.</param>
-        /// <param name="right">The vector to exclusive-or with <paramref name="left" />.</param>
-        /// <typeparam name="T">The type of the elements in the vector.</typeparam>
-        /// <returns>The exclusive-or of <paramref name="left" /> and <paramref name="right" />.</returns>
[Intrinsic] [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector64 Xor(Vector64 left, Vector64 right) - where T : struct => left ^ right; + internal static unsafe Vector64 WidenUpper(Vector64 source) + { + Unsafe.SkipInit(out Vector64 upper); + for (int i = Vector64.Count; i < Vector64.Count; i++) + { + var value = (long)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector64.Count, value); + } + + return upper; + } + + [Intrinsic] [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static T GetElementUnsafe(in this Vector64 vector, int index) - where T : struct + internal static unsafe Vector64 WidenUpper(Vector64 source) { - Debug.Assert((index >= 0) && (index < Vector64.Count)); - return Unsafe.Add(ref Unsafe.As, T>(ref Unsafe.AsRef(in vector)), index); + Unsafe.SkipInit(out Vector64 upper); + + for (int i = Vector64.Count; i < Vector64.Count; i++) + { + var value = (short)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector64.Count, value); + } + + return upper; } + [Intrinsic] [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static void SetElementUnsafe(in this Vector64 vector, int index, T value) - where T : struct + internal static unsafe Vector64 WidenUpper(Vector64 source) { - Debug.Assert((index >= 0) && (index < Vector64.Count)); - Unsafe.Add(ref Unsafe.As, T>(ref Unsafe.AsRef(in vector)), index) = value; + Unsafe.SkipInit(out Vector64 upper); + + for (int i = Vector64.Count; i < Vector64.Count; i++) + { + var value = (double)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector64.Count, value); + } + + return upper; + } + + [Intrinsic] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe Vector64 WidenUpper(Vector64 source) + { + Unsafe.SkipInit(out Vector64 upper); + + for (int i = Vector64.Count; i < Vector64.Count; i++) + { + var value = (uint)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector64.Count, value); + } + + return upper; + } + + [Intrinsic] + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static unsafe Vector64 WidenUpper(Vector64 source) + { + Unsafe.SkipInit(out Vector64 upper); + + for (int i = Vector64.Count; i < Vector64.Count; i++) + { + var value = (ulong)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector64.Count, value); + } + + return upper; } } } From 515c3ce4e299bb45d4a8998cc97c1a85a7f3eafc Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Wed, 6 Oct 2021 15:05:35 -0700 Subject: [PATCH 3/5] Fix some handling of Narrow/Widen hwintrinsics --- src/coreclr/jit/compiler.h | 14 +- src/coreclr/jit/gentree.cpp | 190 ++++++++++++++++++------- src/coreclr/jit/hwintrinsicarm64.cpp | 5 +- src/coreclr/jit/hwintrinsiclistarm64.h | 4 +- 4 files changed, 148 insertions(+), 65 deletions(-) diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 623119d2ea08a..33ee8b025aa64 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -3279,17 +3279,11 @@ class Compiler unsigned simdSize, bool isSimdAsHWIntrinsic); - GenTree* gtNewSimdWidenLowerNode(var_types type, - GenTree* op1, - CorInfoType simdBaseJitType, - unsigned simdSize, - bool isSimdAsHWIntrinsic); + GenTree* gtNewSimdWidenLowerNode( + var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize, bool isSimdAsHWIntrinsic); - GenTree* gtNewSimdWidenUpperNode(var_types type, - GenTree* op1, - CorInfoType simdBaseJitType, - unsigned simdSize, - bool isSimdAsHWIntrinsic); + GenTree* gtNewSimdWidenUpperNode( + var_types type, 
GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize, bool isSimdAsHWIntrinsic); GenTree* gtNewSimdWithElementNode(var_types type, GenTree* op1, diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 3bc9490498e5d..36cda9fe96050 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -20930,7 +20930,7 @@ GenTree* Compiler::gtNewSimdNarrowNode(var_types type, assert(op2->TypeIs(type)); var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType); - assert(varTypeIsArithmetic(simdBaseType)); + assert(varTypeIsArithmetic(simdBaseType) && !varTypeIsLong(simdBaseType)); GenTree* tmp1; GenTree* tmp2; @@ -21069,10 +21069,10 @@ GenTree* Compiler::gtNewSimdNarrowNode(var_types type, // var tmp2 = Avx.ConvertToVector128Single(op2); // return Avx.InsertVector128(tmp1, tmp2, 1); - tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_AVX_ConvertToVector128Single, simdBaseJitType, simdSize, - isSimdAsHWIntrinsic); - tmp2 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, NI_AVX_ConvertToVector128Single, simdBaseJitType, simdSize, - isSimdAsHWIntrinsic); + tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_AVX_ConvertToVector128Single, simdBaseJitType, + simdSize, isSimdAsHWIntrinsic); + tmp2 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, NI_AVX_ConvertToVector128Single, simdBaseJitType, + simdSize, isSimdAsHWIntrinsic); tmp1 = gtNewSimdHWIntrinsicNode(type, tmp1, NI_Vector128_ToVector256Unsafe, simdBaseJitType, 16, isSimdAsHWIntrinsic); @@ -21277,24 +21277,42 @@ GenTree* Compiler::gtNewSimdNarrowNode(var_types type, } } #elif defined(TARGET_ARM64) - if (varTypeIsFloating(simdBaseType)) + if (simdSize == 16) { - // var tmp1 = AdvSimd.Arm64.ConvertToSingleLower(op1); - // return AdvSimd.Arm64.ConvertToSingleUpper(tmp1, op2); + if (varTypeIsFloating(simdBaseType)) + { + // var tmp1 = AdvSimd.Arm64.ConvertToSingleLower(op1); + // return AdvSimd.Arm64.ConvertToSingleUpper(tmp1, op2); - tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_AdvSimd_Arm64_ConvertToSingleLower, simdBaseJitType, 8, - isSimdAsHWIntrinsic); - return gtNewSimdHWIntrinsicNode(type, tmp1, op2, NI_AdvSimd_Arm64_ConvertToSingleUpper, simdBaseJitType, - simdSize, isSimdAsHWIntrinsic); + tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_AdvSimd_Arm64_ConvertToSingleLower, simdBaseJitType, 8, + isSimdAsHWIntrinsic); + return gtNewSimdHWIntrinsicNode(type, tmp1, op2, NI_AdvSimd_Arm64_ConvertToSingleUpper, simdBaseJitType, + simdSize, isSimdAsHWIntrinsic); + } + else + { + // var tmp1 = AdvSimd.ExtractNarrowingLower(op1); + // return AdvSimd.ExtractNarrowingUpper(tmp1, op2); + + tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_AdvSimd_ExtractNarrowingLower, simdBaseJitType, 8, + isSimdAsHWIntrinsic); + return gtNewSimdHWIntrinsicNode(type, tmp1, op2, NI_AdvSimd_ExtractNarrowingUpper, simdBaseJitType, + simdSize, isSimdAsHWIntrinsic); + } } - else if (simdSize == 16) + else if (varTypeIsFloating(simdBaseType)) { - // var tmp1 = AdvSimd.ExtractNarrowingLower(op1); - // return AdvSimd.ExtractNarrowingUpper(tmp1, op2); + // var tmp1 = op1.ToVector128Unsafe(); + // return AdvSimd.Arm64.ConvertToSingleLower(tmp1); + + CorInfoType tmp2BaseJitType = CORINFO_TYPE_DOUBLE; - tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_AdvSimd_ExtractNarrowingLower, simdBaseJitType, 8, + tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_Vector64_ToVector128Unsafe, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); - return gtNewSimdHWIntrinsicNode(type, tmp1, op2, NI_AdvSimd_ExtractNarrowingUpper, simdBaseJitType, 
simdSize, + tmp2 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp1, gtNewIconNode(1), op2, NI_AdvSimd_InsertScalar, + tmp2BaseJitType, 16, isSimdAsHWIntrinsic); + + return gtNewSimdHWIntrinsicNode(type, tmp2, NI_AdvSimd_Arm64_ConvertToSingleLower, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); } else @@ -21455,11 +21473,8 @@ GenTree* Compiler::gtNewSimdUnOpNode(genTreeOps op, } } -GenTree* Compiler::gtNewSimdWidenLowerNode(var_types type, - GenTree* op1, - CorInfoType simdBaseJitType, - unsigned simdSize, - bool isSimdAsHWIntrinsic) +GenTree* Compiler::gtNewSimdWidenLowerNode( + var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize, bool isSimdAsHWIntrinsic) { assert(IsBaselineSimdIsaSupportedDebugOnly()); @@ -21470,7 +21485,7 @@ GenTree* Compiler::gtNewSimdWidenLowerNode(var_types type, assert(op1->TypeIs(type)); var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType); - assert(varTypeIsArithmetic(simdBaseType)); + assert(varTypeIsArithmetic(simdBaseType) && !varTypeIsLong(simdBaseType)); NamedIntrinsic intrinsic = NI_Illegal; @@ -21482,7 +21497,8 @@ GenTree* Compiler::gtNewSimdWidenLowerNode(var_types type, assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); assert(!varTypeIsIntegral(simdBaseType) || compIsaSupportedDebugOnly(InstructionSet_AVX2)); - tmp1 = gtNewSimdHWIntrinsicNode(type, op1, NI_Vector256_GetLower, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + tmp1 = + gtNewSimdHWIntrinsicNode(type, op1, NI_Vector256_GetLower, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); switch (simdBaseType) { @@ -21568,16 +21584,36 @@ GenTree* Compiler::gtNewSimdWidenLowerNode(var_types type, if (varTypeIsSigned(simdBaseType)) { - tmp1 = gtNewSimdHWIntrinsicNode(type, op1, tmp1, NI_SSE2_CompareLessThan, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + CORINFO_CLASS_HANDLE clsHnd = gtGetStructHandleForSIMD(type, simdBaseJitType); + + GenTree* op1Dup; + op1 = impCloneExpr(op1, &op1Dup, clsHnd, (unsigned)CHECK_SPILL_ALL, + nullptr DEBUGARG("Clone op1 for vector widen lower")); + + tmp1 = gtNewSimdHWIntrinsicNode(type, op1, tmp1, NI_SSE2_CompareLessThan, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + + op1 = op1Dup; } - return gtNewSimdHWIntrinsicNode(type, op1, tmp1, NI_SSE2_UnpackLow, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + return gtNewSimdHWIntrinsicNode(type, op1, tmp1, NI_SSE2_UnpackLow, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); } #elif defined(TARGET_ARM64) - tmp1 = gtNewSimdHWIntrinsicNode(type, op1, NI_Vector128_GetLower, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + if (simdSize == 16) + { + tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_Vector128_GetLower, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + } + else + { + assert(simdSize == 8); + tmp1 = op1; + } if (varTypeIsFloating(simdBaseType)) { + assert(simdBaseType == TYP_FLOAT); intrinsic = NI_AdvSimd_Arm64_ConvertToDouble; } else if (varTypeIsSigned(simdBaseType)) @@ -21590,17 +21626,21 @@ GenTree* Compiler::gtNewSimdWidenLowerNode(var_types type, } assert(intrinsic != NI_Illegal); - return gtNewSimdHWIntrinsicNode(type, tmp1, intrinsic, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + tmp1 = gtNewSimdHWIntrinsicNode(type, tmp1, intrinsic, simdBaseJitType, 8, isSimdAsHWIntrinsic); + + if (simdSize == 8) + { + tmp1 = gtNewSimdHWIntrinsicNode(type, tmp1, NI_Vector128_GetLower, simdBaseJitType, 16, isSimdAsHWIntrinsic); + } + + return tmp1; #else #error Unsupported platform #endif // !TARGET_XARCH && !TARGET_ARM64 } -GenTree* 
Compiler::gtNewSimdWidenUpperNode(var_types type, - GenTree* op1, - CorInfoType simdBaseJitType, - unsigned simdSize, - bool isSimdAsHWIntrinsic) +GenTree* Compiler::gtNewSimdWidenUpperNode( + var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize, bool isSimdAsHWIntrinsic) { assert(IsBaselineSimdIsaSupportedDebugOnly()); @@ -21611,19 +21651,20 @@ GenTree* Compiler::gtNewSimdWidenUpperNode(var_types type, assert(op1->TypeIs(type)); var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType); - assert(varTypeIsArithmetic(simdBaseType)); + assert(varTypeIsArithmetic(simdBaseType) && !varTypeIsLong(simdBaseType)); NamedIntrinsic intrinsic = NI_Illegal; -#if defined(TARGET_XARCH) GenTree* tmp1; +#if defined(TARGET_XARCH) if (simdSize == 32) { assert(compIsaSupportedDebugOnly(InstructionSet_AVX)); assert(!varTypeIsIntegral(simdBaseType) || compIsaSupportedDebugOnly(InstructionSet_AVX2)); - tmp1 = gtNewSimdHWIntrinsicNode(type, op1, gtNewIconNode(1), NI_AVX_ExtractVector128, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + tmp1 = gtNewSimdHWIntrinsicNode(type, op1, gtNewIconNode(1), NI_AVX_ExtractVector128, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); switch (simdBaseType) { @@ -21672,12 +21713,15 @@ GenTree* Compiler::gtNewSimdWidenUpperNode(var_types type, op1 = impCloneExpr(op1, &op1Dup, clsHnd, (unsigned)CHECK_SPILL_ALL, nullptr DEBUGARG("Clone op1 for vector widen upper")); - tmp1 = gtNewSimdHWIntrinsicNode(type, op1, op1Dup, NI_SSE_MoveHighToLow, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); - return gtNewSimdHWIntrinsicNode(type, tmp1, NI_SSE2_ConvertToVector128Double, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + tmp1 = gtNewSimdHWIntrinsicNode(type, op1, op1Dup, NI_SSE_MoveHighToLow, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + return gtNewSimdHWIntrinsicNode(type, tmp1, NI_SSE2_ConvertToVector128Double, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); } else if (compOpportunisticallyDependsOn(InstructionSet_SSE41)) { - tmp1 = gtNewSimdHWIntrinsicNode(type, op1, gtNewIconNode(8), NI_SSE2_ShiftRightLogical128BitLane, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + tmp1 = gtNewSimdHWIntrinsicNode(type, op1, gtNewIconNode(8), NI_SSE2_ShiftRightLogical128BitLane, + simdBaseJitType, simdSize, isSimdAsHWIntrinsic); switch (simdBaseType) { @@ -21717,27 +21761,71 @@ GenTree* Compiler::gtNewSimdWidenUpperNode(var_types type, if (varTypeIsSigned(simdBaseType)) { - tmp1 = gtNewSimdHWIntrinsicNode(type, op1, tmp1, NI_SSE2_CompareLessThan, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + CORINFO_CLASS_HANDLE clsHnd = gtGetStructHandleForSIMD(type, simdBaseJitType); + + GenTree* op1Dup; + op1 = impCloneExpr(op1, &op1Dup, clsHnd, (unsigned)CHECK_SPILL_ALL, + nullptr DEBUGARG("Clone op1 for vector widen upper")); + + tmp1 = gtNewSimdHWIntrinsicNode(type, op1, tmp1, NI_SSE2_CompareLessThan, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + + op1 = op1Dup; } - return gtNewSimdHWIntrinsicNode(type, op1, tmp1, NI_SSE2_UnpackHigh, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + return gtNewSimdHWIntrinsicNode(type, op1, tmp1, NI_SSE2_UnpackHigh, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); } #elif defined(TARGET_ARM64) - if (varTypeIsFloating(simdBaseType)) - { - intrinsic = NI_AdvSimd_Arm64_ConvertToDoubleUpper; - } - else if (varTypeIsSigned(simdBaseType)) + GenTree* zero; + + if (simdSize == 16) { - intrinsic = NI_AdvSimd_SignExtendWideningUpper; + if (varTypeIsFloating(simdBaseType)) + { + assert(simdBaseType == TYP_FLOAT); + 
intrinsic = NI_AdvSimd_Arm64_ConvertToDoubleUpper; + } + else if (varTypeIsSigned(simdBaseType)) + { + intrinsic = NI_AdvSimd_SignExtendWideningUpper; + } + else + { + intrinsic = NI_AdvSimd_ZeroExtendWideningUpper; + } + + assert(intrinsic != NI_Illegal); + return gtNewSimdHWIntrinsicNode(type, op1, intrinsic, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); } else { - intrinsic = NI_AdvSimd_ZeroExtendWideningUpper; - } + assert(simdSize == 8); + ssize_t index = 8 / genTypeSize(simdBaseType); - assert(intrinsic != NI_Illegal); - return gtNewSimdHWIntrinsicNode(type, op1, intrinsic, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + if (varTypeIsFloating(simdBaseType)) + { + assert(simdBaseType == TYP_FLOAT); + intrinsic = NI_AdvSimd_Arm64_ConvertToDouble; + } + else if (varTypeIsSigned(simdBaseType)) + { + intrinsic = NI_AdvSimd_SignExtendWideningLower; + } + else + { + intrinsic = NI_AdvSimd_ZeroExtendWideningLower; + } + + assert(intrinsic != NI_Illegal); + + tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, intrinsic, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); + zero = gtNewSimdZeroNode(TYP_SIMD16, simdBaseJitType, 16, isSimdAsHWIntrinsic); + tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp1, zero, gtNewIconNode(index), NI_AdvSimd_ExtractVector128, + simdBaseJitType, 16, isSimdAsHWIntrinsic); + return gtNewSimdHWIntrinsicNode(type, tmp1, NI_Vector128_GetLower, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + } #else #error Unsupported platform #endif // !TARGET_XARCH && !TARGET_ARM64 diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp index 3fcf4a6528192..8f4d0f0ebf96f 100644 --- a/src/coreclr/jit/hwintrinsicarm64.cpp +++ b/src/coreclr/jit/hwintrinsicarm64.cpp @@ -825,7 +825,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(retType); op1 = impSIMDPopStack(retType); - retNode = gtNewSimdNarrowNode(retType, op1, op2, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); + retNode = + gtNewSimdNarrowNode(retType, op1, op2, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); break; } @@ -917,7 +918,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector64_WidenUpper: case NI_Vector128_WidenUpper: { - assert(sig->numArgs == 2); + assert(sig->numArgs == 1); op1 = impSIMDPopStack(retType); diff --git a/src/coreclr/jit/hwintrinsiclistarm64.h b/src/coreclr/jit/hwintrinsiclistarm64.h index 97bdd3bb0004a..3f6c90a1315f8 100644 --- a/src/coreclr/jit/hwintrinsiclistarm64.h +++ b/src/coreclr/jit/hwintrinsiclistarm64.h @@ -173,8 +173,8 @@ HARDWARE_INTRINSIC(Vector128, op_UnaryPlus, HARDWARE_INTRINSIC(Vector128, Subtract, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, Sqrt, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, ToScalar, 16, 1, {INS_smov, INS_umov, INS_smov, INS_umov, INS_smov, INS_umov, INS_umov, INS_umov, INS_dup, INS_dup}, HW_Category_SIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SIMDScalar|HW_Flag_SpecialCodeGen) -HARDWARE_INTRINSIC(Vector128, WidenLower, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, 
HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, WidenUpper, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, WidenLower, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector128, WidenUpper, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector128, WithElement, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(Vector128, Xor, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) From c926d856c0fa1784725ba68d8b8a64340daa58be Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Fri, 8 Oct 2021 14:06:03 -0700 Subject: [PATCH 4/5] Ensure that Vector.Widen is still treated as an intrinsic --- .../src/System/Numerics/Vector.cs | 327 +++++++++++------- .../System/Runtime/Intrinsics/Vector128.cs | 14 - .../System/Runtime/Intrinsics/Vector256.cs | 14 - .../src/System/Runtime/Intrinsics/Vector64.cs | 14 - 4 files changed, 206 insertions(+), 163 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Numerics/Vector.cs b/src/libraries/System.Private.CoreLib/src/System/Numerics/Vector.cs index 2e030002cd6e0..70c36a619ec63 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Numerics/Vector.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Numerics/Vector.cs @@ -1181,210 +1181,295 @@ public static Vector Subtract(Vector left, Vector right) /// A vector that will contain the widened result of the lower half of . /// A vector that will contain the widened result of the upper half of . [CLSCompliant(false)] - [Intrinsic] public static unsafe void Widen(Vector source, out Vector low, out Vector high) { - Unsafe.SkipInit(out Vector lowerResult); - Unsafe.SkipInit(out Vector upperResult); + low = WidenLower(source); + high = WidenUpper(source); + } - for (int i = 0; i < Vector.Count; i++) - { - var value = (ushort)source.GetElementUnsafe(i); - lowerResult.SetElementUnsafe(i, value); - } + /// Widens a into two . + /// The vector whose elements are to be widened. + /// A vector that will contain the widened result of the lower half of . + /// A vector that will contain the widened result of the upper half of . + public static unsafe void Widen(Vector source, out Vector low, out Vector high) + { + low = WidenLower(source); + high = WidenUpper(source); + } - for (int i = Vector.Count; i < Vector.Count; i++) - { - var value = (ushort)source.GetElementUnsafe(i); - upperResult.SetElementUnsafe(i - Vector.Count, value); - } + /// Widens a into two . + /// The vector whose elements are to be widened. + /// A vector that will contain the widened result of the lower half of . + /// A vector that will contain the widened result of the upper half of . 
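
The Vector<T> overloads above and below keep their out-parameter shape and now delegate to internal WidenLower/WidenUpper helpers, so caller-visible behavior is unchanged. For example, assuming a 128-bit Vector<T> so that Vector<float>.Count == 4 (illustrative only):

    using System.Numerics;

    Vector<float> source = new Vector<float>(new float[] { 1f, 2f, 3f, 4f });
    Vector.Widen(source, out Vector<double> low, out Vector<double> high);
    // low == { 1.0, 2.0 }, high == { 3.0, 4.0 }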
+ public static unsafe void Widen(Vector source, out Vector low, out Vector high) + { + low = WidenLower(source); + high = WidenUpper(source); + } + + /// Widens a into two . + /// The vector whose elements are to be widened. + /// A vector that will contain the widened result of the lower half of . + /// A vector that will contain the widened result of the upper half of . + [CLSCompliant(false)] + public static unsafe void Widen(Vector source, out Vector low, out Vector high) + { + low = WidenLower(source); + high = WidenUpper(source); + } - low = lowerResult; - high = upperResult; + /// Widens a into two . + /// The vector whose elements are to be widened. + /// A vector that will contain the widened result of the lower half of . + /// A vector that will contain the widened result of the upper half of . + public static unsafe void Widen(Vector source, out Vector low, out Vector high) + { + low = WidenLower(source); + high = WidenUpper(source); } - /// Widens a into two . + /// Widens a into two . /// The vector whose elements are to be widened. /// A vector that will contain the widened result of the lower half of . /// A vector that will contain the widened result of the upper half of . + [CLSCompliant(false)] + public static unsafe void Widen(Vector source, out Vector low, out Vector high) + { + low = WidenLower(source); + high = WidenUpper(source); + } + + /// Widens a into two . + /// The vector whose elements are to be widened. + /// A vector that will contain the widened result of the lower half of . + /// A vector that will contain the widened result of the upper half of . + [CLSCompliant(false)] [Intrinsic] - public static unsafe void Widen(Vector source, out Vector low, out Vector high) + public static unsafe void Widen(Vector source, out Vector low, out Vector high) + { + low = WidenLower(source); + high = WidenUpper(source); + } + + /// Computes the exclusive-or of two vectors. + /// The vector to exclusive-or with . + /// The vector to exclusive-or with . + /// The type of the elements in the vector. + /// The exclusive-or of and . 
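
A note on the Vector<T> helpers that follow: unlike the fixed-width Vector64/128/256 variants, their loop bounds depend on the machine's vector width. What they rely on is the 2:1 element-count ratio between the narrow and wide element types, which holds at any width (illustrative check):

    using System.Diagnostics;
    using System.Numerics;

    // A Vector<byte> always holds exactly twice as many elements as a Vector<ushort>,
    // so WidenLower fills one full Vector<ushort> from the low half of the source.
    Debug.Assert(Vector<byte>.Count == 2 * Vector<ushort>.Count);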
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector Xor(Vector left, Vector right) + where T : struct => left ^ right; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static T GetElementUnsafe(in this Vector vector, int index) + where T : struct { - Unsafe.SkipInit(out Vector lowerResult); - Unsafe.SkipInit(out Vector upperResult); + Debug.Assert((index >= 0) && (index < Vector.Count)); + return Unsafe.Add(ref Unsafe.As, T>(ref Unsafe.AsRef(in vector)), index); + } - for (int i = 0; i < Vector.Count; i++) + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static void SetElementUnsafe(in this Vector vector, int index, T value) + where T : struct + { + Debug.Assert((index >= 0) && (index < Vector.Count)); + Unsafe.Add(ref Unsafe.As, T>(ref Unsafe.AsRef(in vector)), index) = value; + } + + [Intrinsic] + internal static Vector WidenLower(Vector source) + { + Unsafe.SkipInit(out Vector lower); + + for (int i = 0; i < Vector.Count; i++) { - var value = (int)source.GetElementUnsafe(i); - lowerResult.SetElementUnsafe(i, value); + var value = (ushort)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); } - for (int i = Vector.Count; i < Vector.Count; i++) + return lower; + } + + [Intrinsic] + internal static unsafe Vector WidenLower(Vector source) + { + Unsafe.SkipInit(out Vector lower); + + for (int i = 0; i < Vector.Count; i++) { var value = (int)source.GetElementUnsafe(i); - upperResult.SetElementUnsafe(i - Vector.Count, value); + lower.SetElementUnsafe(i, value); } - low = lowerResult; - high = upperResult; + return lower; } - /// Widens a into two . - /// The vector whose elements are to be widened. - /// A vector that will contain the widened result of the lower half of . - /// A vector that will contain the widened result of the upper half of . [Intrinsic] - public static unsafe void Widen(Vector source, out Vector low, out Vector high) + internal static unsafe Vector WidenLower(Vector source) { - Unsafe.SkipInit(out Vector lowerResult); - Unsafe.SkipInit(out Vector upperResult); + Unsafe.SkipInit(out Vector lower); for (int i = 0; i < Vector.Count; i++) { var value = (long)source.GetElementUnsafe(i); - lowerResult.SetElementUnsafe(i, value); + lower.SetElementUnsafe(i, value); } - for (int i = Vector.Count; i < Vector.Count; i++) - { - var value = (long)source.GetElementUnsafe(i); - upperResult.SetElementUnsafe(i - Vector.Count, value); - } - - low = lowerResult; - high = upperResult; + return lower; } - /// Widens a into two . - /// The vector whose elements are to be widened. - /// A vector that will contain the widened result of the lower half of . - /// A vector that will contain the widened result of the upper half of . 
- [CLSCompliant(false)] [Intrinsic] - public static unsafe void Widen(Vector source, out Vector low, out Vector high) + internal static unsafe Vector WidenLower(Vector source) { - Unsafe.SkipInit(out Vector lowerResult); - Unsafe.SkipInit(out Vector upperResult); + Unsafe.SkipInit(out Vector lower); for (int i = 0; i < Vector.Count; i++) { var value = (short)source.GetElementUnsafe(i); - lowerResult.SetElementUnsafe(i, value); + lower.SetElementUnsafe(i, value); } - for (int i = Vector.Count; i < Vector.Count; i++) + return lower; + } + + [Intrinsic] + internal static unsafe Vector WidenLower(Vector source) + { + Unsafe.SkipInit(out Vector lower); + + for (int i = 0; i < Vector.Count; i++) { - var value = (short)source.GetElementUnsafe(i); - upperResult.SetElementUnsafe(i - Vector.Count, value); + var value = (double)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); } - low = lowerResult; - high = upperResult; + return lower; } - /// Widens a into two . - /// The vector whose elements are to be widened. - /// A vector that will contain the widened result of the lower half of . - /// A vector that will contain the widened result of the upper half of . [Intrinsic] - public static unsafe void Widen(Vector source, out Vector low, out Vector high) + internal static unsafe Vector WidenLower(Vector source) { - Unsafe.SkipInit(out Vector lowerResult); - Unsafe.SkipInit(out Vector upperResult); + Unsafe.SkipInit(out Vector lower); - for (int i = 0; i < Vector.Count; i++) + for (int i = 0; i < Vector.Count; i++) { - var value = (double)source.GetElementUnsafe(i); - lowerResult.SetElementUnsafe(i, value); + var value = (uint)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); } - for (int i = Vector.Count; i < Vector.Count; i++) + return lower; + } + + [Intrinsic] + internal static unsafe Vector WidenLower(Vector source) + { + Unsafe.SkipInit(out Vector lower); + + for (int i = 0; i < Vector.Count; i++) { - var value = (double)source.GetElementUnsafe(i); - upperResult.SetElementUnsafe(i - Vector.Count, value); + var value = (ulong)source.GetElementUnsafe(i); + lower.SetElementUnsafe(i, value); } - low = lowerResult; - high = upperResult; + return lower; } - /// Widens a into two . - /// The vector whose elements are to be widened. - /// A vector that will contain the widened result of the lower half of . - /// A vector that will contain the widened result of the upper half of . - [CLSCompliant(false)] [Intrinsic] - public static unsafe void Widen(Vector source, out Vector low, out Vector high) + internal static Vector WidenUpper(Vector source) { - Unsafe.SkipInit(out Vector lowerResult); - Unsafe.SkipInit(out Vector upperResult); + Unsafe.SkipInit(out Vector upper); - for (int i = 0; i < Vector.Count; i++) + for (int i = Vector.Count; i < Vector.Count; i++) { - var value = (uint)source.GetElementUnsafe(i); - lowerResult.SetElementUnsafe(i, value); + var value = (ushort)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector.Count, value); } - for (int i = Vector.Count; i < Vector.Count; i++) + return upper; + } + + [Intrinsic] + internal static unsafe Vector WidenUpper(Vector source) + { + Unsafe.SkipInit(out Vector upper); + + for (int i = Vector.Count; i < Vector.Count; i++) { - var value = (uint)source.GetElementUnsafe(i); - upperResult.SetElementUnsafe(i - Vector.Count, value); + var value = (int)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector.Count, value); } - low = lowerResult; - high = upperResult; + return upper; } - /// Widens a into two . 
- /// The vector whose elements are to be widened. - /// A vector that will contain the widened result of the lower half of . - /// A vector that will contain the widened result of the upper half of . - [CLSCompliant(false)] [Intrinsic] - public static unsafe void Widen(Vector source, out Vector low, out Vector high) + internal static unsafe Vector WidenUpper(Vector source) { - Unsafe.SkipInit(out Vector lowerResult); - Unsafe.SkipInit(out Vector upperResult); + Unsafe.SkipInit(out Vector upper); - for (int i = 0; i < Vector.Count; i++) + for (int i = Vector.Count; i < Vector.Count; i++) { - var value = (ulong)source.GetElementUnsafe(i); - lowerResult.SetElementUnsafe(i, value); + var value = (long)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector.Count, value); } - for (int i = Vector.Count; i < Vector.Count; i++) + return upper; + } + + [Intrinsic] + internal static unsafe Vector WidenUpper(Vector source) + { + Unsafe.SkipInit(out Vector upper); + + for (int i = Vector.Count; i < Vector.Count; i++) { - var value = (ulong)source.GetElementUnsafe(i); - upperResult.SetElementUnsafe(i - Vector.Count, value); + var value = (short)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector.Count, value); } - low = lowerResult; - high = upperResult; + return upper; } - /// Computes the exclusive-or of two vectors. - /// The vector to exclusive-or with . - /// The vector to exclusive-or with . - /// The type of the elements in the vector. - /// The exclusive-or of and . - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector Xor(Vector left, Vector right) - where T : struct => left ^ right; + [Intrinsic] + internal static unsafe Vector WidenUpper(Vector source) + { + Unsafe.SkipInit(out Vector upper); - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static T GetElementUnsafe(in this Vector vector, int index) - where T : struct + for (int i = Vector.Count; i < Vector.Count; i++) + { + var value = (double)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector.Count, value); + } + + return upper; + } + + [Intrinsic] + internal static unsafe Vector WidenUpper(Vector source) { - Debug.Assert((index >= 0) && (index < Vector.Count)); - return Unsafe.Add(ref Unsafe.As, T>(ref Unsafe.AsRef(in vector)), index); + Unsafe.SkipInit(out Vector upper); + + for (int i = Vector.Count; i < Vector.Count; i++) + { + var value = (uint)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector.Count, value); + } + + return upper; } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static void SetElementUnsafe(in this Vector vector, int index, T value) - where T : struct + [Intrinsic] + internal static unsafe Vector WidenUpper(Vector source) { - Debug.Assert((index >= 0) && (index < Vector.Count)); - Unsafe.Add(ref Unsafe.As, T>(ref Unsafe.AsRef(in vector)), index) = value; + Unsafe.SkipInit(out Vector upper); + + for (int i = Vector.Count; i < Vector.Count; i++) + { + var value = (ulong)source.GetElementUnsafe(i); + upper.SetElementUnsafe(i - Vector.Count, value); + } + + return upper; } /// diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs index f96e13199ff0b..445a0af111baa 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector128.cs @@ -2838,7 +2838,6 @@ internal static void SetElementUnsafe(in 
this Vector128 vector, int index, } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static Vector128 WidenLower(Vector128 source) { Unsafe.SkipInit(out Vector128 lower); @@ -2853,7 +2852,6 @@ internal static Vector128 WidenLower(Vector128 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector128 WidenLower(Vector128 source) { Unsafe.SkipInit(out Vector128 lower); @@ -2868,7 +2866,6 @@ internal static unsafe Vector128 WidenLower(Vector128 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector128 WidenLower(Vector128 source) { Unsafe.SkipInit(out Vector128 lower); @@ -2883,7 +2880,6 @@ internal static unsafe Vector128 WidenLower(Vector128 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector128 WidenLower(Vector128 source) { Unsafe.SkipInit(out Vector128 lower); @@ -2898,7 +2894,6 @@ internal static unsafe Vector128 WidenLower(Vector128 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector128 WidenLower(Vector128 source) { Unsafe.SkipInit(out Vector128 lower); @@ -2913,7 +2908,6 @@ internal static unsafe Vector128 WidenLower(Vector128 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector128 WidenLower(Vector128 source) { Unsafe.SkipInit(out Vector128 lower); @@ -2928,7 +2922,6 @@ internal static unsafe Vector128 WidenLower(Vector128 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector128 WidenLower(Vector128 source) { Unsafe.SkipInit(out Vector128 lower); @@ -2943,7 +2936,6 @@ internal static unsafe Vector128 WidenLower(Vector128 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static Vector128 WidenUpper(Vector128 source) { Unsafe.SkipInit(out Vector128 upper); @@ -2958,7 +2950,6 @@ internal static Vector128 WidenUpper(Vector128 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector128 WidenUpper(Vector128 source) { Unsafe.SkipInit(out Vector128 upper); @@ -2973,7 +2964,6 @@ internal static unsafe Vector128 WidenUpper(Vector128 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector128 WidenUpper(Vector128 source) { Unsafe.SkipInit(out Vector128 upper); @@ -2988,7 +2978,6 @@ internal static unsafe Vector128 WidenUpper(Vector128 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector128 WidenUpper(Vector128 source) { Unsafe.SkipInit(out Vector128 upper); @@ -3003,7 +2992,6 @@ internal static unsafe Vector128 WidenUpper(Vector128 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector128 WidenUpper(Vector128 source) { Unsafe.SkipInit(out Vector128 upper); @@ -3018,7 +3006,6 @@ internal static unsafe Vector128 WidenUpper(Vector128 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector128 WidenUpper(Vector128 source) { Unsafe.SkipInit(out Vector128 upper); @@ -3033,7 +3020,6 @@ internal static unsafe Vector128 WidenUpper(Vector128 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector128 WidenUpper(Vector128 source) { Unsafe.SkipInit(out Vector128 upper); diff --git 
a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs index f2fc2972e75b7..d696075b9c972 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs @@ -2952,7 +2952,6 @@ internal static void SetElementUnsafe(in this Vector256 vector, int index, } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static Vector256 WidenLower(Vector256 source) { Unsafe.SkipInit(out Vector256 lower); @@ -2967,7 +2966,6 @@ internal static Vector256 WidenLower(Vector256 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector256 WidenLower(Vector256 source) { Unsafe.SkipInit(out Vector256 lower); @@ -2982,7 +2980,6 @@ internal static unsafe Vector256 WidenLower(Vector256 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector256 WidenLower(Vector256 source) { Unsafe.SkipInit(out Vector256 lower); @@ -2997,7 +2994,6 @@ internal static unsafe Vector256 WidenLower(Vector256 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector256 WidenLower(Vector256 source) { Unsafe.SkipInit(out Vector256 lower); @@ -3012,7 +3008,6 @@ internal static unsafe Vector256 WidenLower(Vector256 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector256 WidenLower(Vector256 source) { Unsafe.SkipInit(out Vector256 lower); @@ -3027,7 +3022,6 @@ internal static unsafe Vector256 WidenLower(Vector256 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector256 WidenLower(Vector256 source) { Unsafe.SkipInit(out Vector256 lower); @@ -3042,7 +3036,6 @@ internal static unsafe Vector256 WidenLower(Vector256 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector256 WidenLower(Vector256 source) { Unsafe.SkipInit(out Vector256 lower); @@ -3057,7 +3050,6 @@ internal static unsafe Vector256 WidenLower(Vector256 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static Vector256 WidenUpper(Vector256 source) { Unsafe.SkipInit(out Vector256 upper); @@ -3072,7 +3064,6 @@ internal static Vector256 WidenUpper(Vector256 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector256 WidenUpper(Vector256 source) { Unsafe.SkipInit(out Vector256 upper); @@ -3087,7 +3078,6 @@ internal static unsafe Vector256 WidenUpper(Vector256 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector256 WidenUpper(Vector256 source) { Unsafe.SkipInit(out Vector256 upper); @@ -3102,7 +3092,6 @@ internal static unsafe Vector256 WidenUpper(Vector256 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector256 WidenUpper(Vector256 source) { Unsafe.SkipInit(out Vector256 upper); @@ -3117,7 +3106,6 @@ internal static unsafe Vector256 WidenUpper(Vector256 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector256 WidenUpper(Vector256 source) { Unsafe.SkipInit(out Vector256 upper); @@ -3132,7 +3120,6 @@ internal static unsafe Vector256 WidenUpper(Vector256 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] 
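
This commit also drops [MethodImpl(MethodImplOptions.AggressiveInlining)] from the [Intrinsic] helpers. A plausible reading, not stated in the commit message: when the JIT recognizes the intrinsic the attribute is redundant, and when it does not, force-inlining the scalar fallback loop into every caller is undesirable. The resulting shape, mirroring the Vector256 byte overload from this diff (sketch):

    [Intrinsic]
    internal static Vector256<ushort> WidenLower(Vector256<byte> source)
    {
        Unsafe.SkipInit(out Vector256<ushort> lower);

        for (int i = 0; i < Vector256<ushort>.Count; i++)
        {
            // Scalar fallback; only runs when the JIT does not expand the intrinsic.
            lower.SetElementUnsafe(i, (ushort)source.GetElementUnsafe(i));
        }

        return lower;
    }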
internal static unsafe Vector256 WidenUpper(Vector256 source) { Unsafe.SkipInit(out Vector256 upper); @@ -3147,7 +3134,6 @@ internal static unsafe Vector256 WidenUpper(Vector256 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector256 WidenUpper(Vector256 source) { Unsafe.SkipInit(out Vector256 upper); diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector64.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector64.cs index aca9c0f2d886a..6783c87b037f8 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector64.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector64.cs @@ -2087,7 +2087,6 @@ internal static void SetElementUnsafe(in this Vector64 vector, int index, } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static Vector64 WidenLower(Vector64 source) { Unsafe.SkipInit(out Vector64 lower); @@ -2102,7 +2101,6 @@ internal static Vector64 WidenLower(Vector64 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector64 WidenLower(Vector64 source) { Unsafe.SkipInit(out Vector64 lower); @@ -2117,7 +2115,6 @@ internal static unsafe Vector64 WidenLower(Vector64 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector64 WidenLower(Vector64 source) { Unsafe.SkipInit(out Vector64 lower); @@ -2132,7 +2129,6 @@ internal static unsafe Vector64 WidenLower(Vector64 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector64 WidenLower(Vector64 source) { Unsafe.SkipInit(out Vector64 lower); @@ -2147,7 +2143,6 @@ internal static unsafe Vector64 WidenLower(Vector64 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector64 WidenLower(Vector64 source) { Unsafe.SkipInit(out Vector64 lower); @@ -2162,7 +2157,6 @@ internal static unsafe Vector64 WidenLower(Vector64 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector64 WidenLower(Vector64 source) { Unsafe.SkipInit(out Vector64 lower); @@ -2177,7 +2171,6 @@ internal static unsafe Vector64 WidenLower(Vector64 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector64 WidenLower(Vector64 source) { Unsafe.SkipInit(out Vector64 lower); @@ -2192,7 +2185,6 @@ internal static unsafe Vector64 WidenLower(Vector64 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static Vector64 WidenUpper(Vector64 source) { Unsafe.SkipInit(out Vector64 upper); @@ -2207,7 +2199,6 @@ internal static Vector64 WidenUpper(Vector64 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector64 WidenUpper(Vector64 source) { Unsafe.SkipInit(out Vector64 upper); @@ -2222,7 +2213,6 @@ internal static unsafe Vector64 WidenUpper(Vector64 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector64 WidenUpper(Vector64 source) { Unsafe.SkipInit(out Vector64 upper); @@ -2237,7 +2227,6 @@ internal static unsafe Vector64 WidenUpper(Vector64 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector64 WidenUpper(Vector64 source) { Unsafe.SkipInit(out Vector64 upper); @@ -2252,7 +2241,6 @@ internal static unsafe Vector64 WidenUpper(Vector64 source) } [Intrinsic] - 
[MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector64 WidenUpper(Vector64 source) { Unsafe.SkipInit(out Vector64 upper); @@ -2267,7 +2255,6 @@ internal static unsafe Vector64 WidenUpper(Vector64 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector64 WidenUpper(Vector64 source) { Unsafe.SkipInit(out Vector64 upper); @@ -2282,7 +2269,6 @@ internal static unsafe Vector64 WidenUpper(Vector64 source) } [Intrinsic] - [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static unsafe Vector64 WidenUpper(Vector64 source) { Unsafe.SkipInit(out Vector64 upper); From c5fb319b23c726bded7de3c1c4d314d8fdece638 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Thu, 14 Oct 2021 11:12:34 -0700 Subject: [PATCH 5/5] Fixing NI_VectorT128_WidenUpper on ARM64 to actually call gtNewSimdWidenUpper --- src/coreclr/jit/simdashwintrinsic.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/simdashwintrinsic.cpp b/src/coreclr/jit/simdashwintrinsic.cpp index 2ccffaea8bdca..7f704cd567292 100644 --- a/src/coreclr/jit/simdashwintrinsic.cpp +++ b/src/coreclr/jit/simdashwintrinsic.cpp @@ -809,7 +809,7 @@ GenTree* Compiler::impSimdAsHWIntrinsicSpecial(NamedIntrinsic intrinsic, case NI_VectorT128_WidenUpper: { - return gtNewSimdWidenLowerNode(retType, op1, simdBaseJitType, simdSize, + return gtNewSimdWidenUpperNode(retType, op1, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ true); } #else
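
For reference, the one-line fix in this final commit addresses a real correctness bug: with NI_VectorT128_WidenUpper importing gtNewSimdWidenLowerNode, both halves of Vector.Widen were computed from the lower elements on ARM64. A hypothetical repro, assuming a 128-bit Vector<T> (illustrative only):

    using System.Numerics;

    Vector<short> v = new Vector<short>(new short[] { 0, 1, 2, 3, 4, 5, 6, 7 });
    Vector.Widen(v, out Vector<int> low, out Vector<int> high);

    // Expected:        low == { 0, 1, 2, 3 }, high == { 4, 5, 6, 7 }
    // Before this fix: high mirrored low ({ 0, 1, 2, 3 }) on ARM64.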