diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 9784dd6d55bd5d..f81a3a41ba2d27 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -550,13 +550,9 @@ GenTree* Lowering::LowerNode(GenTree* node) return nextNode; } - nextNode = LowerCast(node); - if (nextNode != nullptr) - { - return nextNode; - } + LowerCast(node); + break; } - break; case GT_BITCAST: { diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 388e136f3f7812..10f94f035b663f 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -421,7 +421,7 @@ class Lowering final : public Phase GenTree* switchValue, weight_t defaultLikelihood); - GenTree* LowerCast(GenTree* node); + void LowerCast(GenTree* node); #if !CPU_LOAD_STORE_ARCH bool IsRMWIndirCandidate(GenTree* operand, GenTree* storeInd); diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 47fbb10db498cb..e6b5c34181f673 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -968,22 +968,11 @@ void Lowering::LowerPutArgStkOrSplit(GenTreePutArgStk* putArgNode) // tree - GT_CAST node to be lowered // // Return Value: -// nextNode to be lowered if tree is modified else returns nullptr -// -// Notes: -// Casts from float/double to a smaller int type are transformed as follows: -// GT_CAST(float/double, byte) = GT_CAST(GT_CAST(float/double, int32), byte) -// GT_CAST(float/double, sbyte) = GT_CAST(GT_CAST(float/double, int32), sbyte) -// GT_CAST(float/double, int16) = GT_CAST(GT_CAST(double/double, int32), int16) -// GT_CAST(float/double, uint16) = GT_CAST(GT_CAST(double/double, int32), uint16) -// -// Note that for the overflow conversions we still depend on helper calls and -// don't expect to see them here. -// i) GT_CAST(float/double, int type with overflow detection) +// None. // -GenTree* Lowering::LowerCast(GenTree* tree) +void Lowering::LowerCast(GenTree* tree) { - assert(tree->OperGet() == GT_CAST); + assert(tree->OperIs(GT_CAST)); JITDUMP("LowerCast for: "); DISPNODE(tree); @@ -995,17 +984,16 @@ GenTree* Lowering::LowerCast(GenTree* tree) if (varTypeIsFloating(srcType)) { + // Overflow casts should have been converted to helper call in morph. noway_assert(!tree->gtOverflow()); - assert(!varTypeIsSmall(dstType)); // fgMorphCast creates intermediate casts when converting from float to small - // int. + // Small types should have had an intermediate int cast inserted in morph. + assert(!varTypeIsSmall(dstType)); } assert(!varTypeIsSmall(srcType)); // Now determine if we have operands that should be contained. ContainCheckCast(tree->AsCast()); - - return nullptr; } //------------------------------------------------------------------------ diff --git a/src/coreclr/jit/lowerloongarch64.cpp b/src/coreclr/jit/lowerloongarch64.cpp index ef06d2ce41791b..b2725dff07167c 100644 --- a/src/coreclr/jit/lowerloongarch64.cpp +++ b/src/coreclr/jit/lowerloongarch64.cpp @@ -514,21 +514,9 @@ void Lowering::LowerPutArgStkOrSplit(GenTreePutArgStk* putArgNode) // Return Value: // None. 
// -// Notes: -// Casts from float/double to a smaller int type are transformed as follows: -// GT_CAST(float/double, byte) = GT_CAST(GT_CAST(float/double, int32), byte) -// GT_CAST(float/double, sbyte) = GT_CAST(GT_CAST(float/double, int32), sbyte) -// GT_CAST(float/double, int16) = GT_CAST(GT_CAST(double/double, int32), int16) -// GT_CAST(float/double, uint16) = GT_CAST(GT_CAST(double/double, int32), uint16) -// -// Note that for the overflow conversions we still depend on helper calls and -// don't expect to see them here. -// i) GT_CAST(float/double, int type with overflow detection) -// - -GenTree* Lowering::LowerCast(GenTree* tree) +void Lowering::LowerCast(GenTree* tree) { - assert(tree->OperGet() == GT_CAST); + assert(tree->OperIs(GT_CAST)); JITDUMP("LowerCast for: "); DISPNODE(tree); @@ -540,17 +528,16 @@ GenTree* Lowering::LowerCast(GenTree* tree) if (varTypeIsFloating(srcType)) { + // Overflow casts should have been converted to helper call in morph. noway_assert(!tree->gtOverflow()); - assert(!varTypeIsSmall(dstType)); // fgMorphCast creates intermediate casts when converting from float to small - // int. + // Small types should have had an intermediate int cast inserted in morph. + assert(!varTypeIsSmall(dstType)); } assert(!varTypeIsSmall(srcType)); // Now determine if we have operands that should be contained. ContainCheckCast(tree->AsCast()); - - return nullptr; } //------------------------------------------------------------------------ diff --git a/src/coreclr/jit/lowerriscv64.cpp b/src/coreclr/jit/lowerriscv64.cpp index 6a1cffd5ca0426..6265d1fce34023 100644 --- a/src/coreclr/jit/lowerriscv64.cpp +++ b/src/coreclr/jit/lowerriscv64.cpp @@ -437,21 +437,9 @@ void Lowering::LowerPutArgStkOrSplit(GenTreePutArgStk* putArgNode) // Return Value: // None. // -// Notes: -// Casts from float/double to a smaller int type are transformed as follows: -// GT_CAST(float/double, byte) = GT_CAST(GT_CAST(float/double, int32), byte) -// GT_CAST(float/double, sbyte) = GT_CAST(GT_CAST(float/double, int32), sbyte) -// GT_CAST(float/double, int16) = GT_CAST(GT_CAST(double/double, int32), int16) -// GT_CAST(float/double, uint16) = GT_CAST(GT_CAST(double/double, int32), uint16) -// -// Note that for the overflow conversions we still depend on helper calls and -// don't expect to see them here. -// i) GT_CAST(float/double, int type with overflow detection) -// - -GenTree* Lowering::LowerCast(GenTree* tree) +void Lowering::LowerCast(GenTree* tree) { - assert(tree->OperGet() == GT_CAST); + assert(tree->OperIs(GT_CAST)); JITDUMP("LowerCast for: "); DISPNODE(tree); @@ -463,17 +451,16 @@ GenTree* Lowering::LowerCast(GenTree* tree) if (varTypeIsFloating(srcType)) { + // Overflow casts should have been converted to helper call in morph. noway_assert(!tree->gtOverflow()); - assert(!varTypeIsSmall(dstType)); // fgMorphCast creates intermediate casts when converting from float to small - // int. + // Small types should have had an intermediate int cast inserted in morph. + assert(!varTypeIsSmall(dstType)); } assert(!varTypeIsSmall(srcType)); // Now determine if we have operands that should be contained. 
ContainCheckCast(tree->AsCast()); - - return nullptr; } //------------------------------------------------------------------------ diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index fe9c7a4073a1df..264539170b8ef4 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -847,38 +847,22 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk) #endif // TARGET_X86 } -/* Lower GT_CAST(srcType, DstType) nodes. - * - * Casts from small int type to float/double are transformed as follows: - * GT_CAST(byte, float/double) = GT_CAST(GT_CAST(byte, int32), float/double) - * GT_CAST(sbyte, float/double) = GT_CAST(GT_CAST(sbyte, int32), float/double) - * GT_CAST(int16, float/double) = GT_CAST(GT_CAST(int16, int32), float/double) - * GT_CAST(uint16, float/double) = GT_CAST(GT_CAST(uint16, int32), float/double) - * - * Unless the EVEX conversion instructions are available, casts from Uint32 - * are morphed as follows by front-end and hence should not be seen here. - * GT_CAST(uint32, float/double) = GT_CAST(GT_CAST(uint32, long), float/double) - * - * - * Similarly casts from float/double to a smaller int type are transformed as follows: - * GT_CAST(float/double, byte) = GT_CAST(GT_CAST(float/double, int32), byte) - * GT_CAST(float/double, sbyte) = GT_CAST(GT_CAST(float/double, int32), sbyte) - * GT_CAST(float/double, int16) = GT_CAST(GT_CAST(double/double, int32), int16) - * GT_CAST(float/double, uint16) = GT_CAST(GT_CAST(double/double, int32), uint16) - * - * Note that for the following conversions we still depend on helper calls and - * don't expect to see them here. - * i) GT_CAST(float/double, uint64) when EVEX is not available - * ii) GT_CAST(float/double, int type with overflow detection) - */ -GenTree* Lowering::LowerCast(GenTree* tree) +//------------------------------------------------------------------------ +// LowerCast: Lower GT_CAST(srcType, DstType) nodes. +// +// Arguments: +// tree - GT_CAST node to be lowered +// +// Return Value: +// None. +// +void Lowering::LowerCast(GenTree* tree) { assert(tree->OperIs(GT_CAST)); - GenTree* castOp = tree->AsCast()->CastOp(); - var_types castToType = tree->CastToType(); - var_types dstType = castToType; - var_types srcType = castOp->TypeGet(); + GenTree* castOp = tree->AsCast()->CastOp(); + var_types dstType = tree->CastToType(); + var_types srcType = castOp->TypeGet(); // force the srcType to unsigned if GT_UNSIGNED flag is set if (tree->IsUnsigned()) @@ -886,345 +870,449 @@ GenTree* Lowering::LowerCast(GenTree* tree) srcType = varTypeToUnsigned(srcType); } - // We should not see the following casts unless directly supported by hardware, - // as they are expected to be lowered appropriately or converted into helper calls by front-end. - // srcType = float/double castToType = * and overflow detecting cast - // Reason: must be converted to a helper call - // srcType = float/double, castToType = ulong - // Reason: must be converted to a helper call - // srcType = float/double, castToType = byte/sbyte/ushort/short - // Reason: must have intermediate cast to int - // srcType = uint castToType = float/double - // Reason: uint -> float/double = uint -> long -> float/double if (varTypeIsFloating(srcType)) { + // Overflow casts should have been converted to helper call in morph. noway_assert(!tree->gtOverflow()); + // Small types should have had an intermediate int cast inserted in morph. 
assert(!varTypeIsSmall(dstType)); - assert(castToType != TYP_ULONG || comp->canUseEvexEncodingDebugOnly()); + // Long types should have been handled by helper call or in DecomposeLongs on x86. + assert(!varTypeIsLong(dstType) || TargetArchitecture::Is64Bit); } else if (srcType == TYP_UINT) { - assert(castToType != TYP_FLOAT || comp->canUseEvexEncodingDebugOnly()); + // uint->float casts should have an intermediate cast to long unless + // we have the EVEX unsigned conversion instructions available. + assert(dstType != TYP_FLOAT || comp->canUseEvexEncodingDebugOnly()); } -#if defined(TARGET_AMD64) - // Handle saturation logic for X64 - // Let InstructionSet_AVX10v2 pass through since it can handle the saturation - if (varTypeIsFloating(srcType) && varTypeIsIntegral(dstType) && !varTypeIsSmall(dstType) && +#ifdef FEATURE_HW_INTRINSICS + if (varTypeIsFloating(srcType) && varTypeIsIntegral(dstType) && !comp->compOpportunisticallyDependsOn(InstructionSet_AVX10v2)) { - // We should have filtered out float -> long conversion and - // converted it to float -> double -> long conversion. - assert((dstType != TYP_LONG) || (srcType != TYP_FLOAT)); + // If we don't have AVX10v2 saturating conversion instructions for + // floating->integral, we have to handle the saturation logic here. - // we should have handled overflow cases in morph itself - assert(!tree->gtOverflow()); + JITDUMP("LowerCast before:\n"); + DISPTREERANGE(BlockRange(), tree); - CorInfoType fieldType = (srcType == TYP_DOUBLE) ? CORINFO_TYPE_DOUBLE : CORINFO_TYPE_FLOAT; - GenTree* castOutput = nullptr; - LIR::Use castOpUse(BlockRange(), &(tree->AsCast()->CastOp()), tree); - ReplaceWithLclVar(castOpUse); - castOp = tree->AsCast()->CastOp(); - bool isV512Supported = false; - /*The code below is to introduce saturating conversions on X86/X64. - The C# equivalence of the code is given below --> - // Replace QNaN and SNaN with Zero - op1 = Avx512F.Fixup(op1, op1, Vector128.Create(0x88), 0); - // Convert from double to long, replacing any values that were greater than or equal to MaxValue - with MaxValue - // Values that were less than or equal to MinValue will already be MinValue - return Vector128.ConditionalSelect( - Vector128.LessThan(op1, Vector128.Create(long.MaxValue)).AsInt64(), - Avx512DQ.VL.ConvertToVector128Int64(op1), - Vector128.Create(long.MaxValue) - ); - */ - if (comp->compIsEvexOpportunisticallySupported(isV512Supported)) - { - // Clone the cast operand for usage. - GenTree* op1Clone1 = comp->gtClone(castOp); - BlockRange().InsertAfter(castOp, op1Clone1); - - // Generate the control table for VFIXUPIMMSD - // The behavior we want is to saturate negative values to 0. - GenTreeVecCon* tbl = comp->gtNewVconNode(TYP_SIMD16); - tbl->gtSimdVal.i32[0] = (varTypeIsUnsigned(dstType)) ? 0x08080088 : 0x00000088; - BlockRange().InsertAfter(op1Clone1, tbl); - - // get a zero int node for control table - GenTree* ctrlByte = comp->gtNewIconNode(0); - BlockRange().InsertAfter(tbl, ctrlByte); - - NamedIntrinsic fixupHwIntrinsicID = !isV512Supported ? 
NI_AVX10v1_FixupScalar : NI_AVX512F_FixupScalar; - if (varTypeIsUnsigned(dstType)) - { - // run vfixupimmsd base on table and no flags reporting - GenTree* oper1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, castOp, op1Clone1, tbl, ctrlByte, - fixupHwIntrinsicID, fieldType, 16); - BlockRange().InsertAfter(ctrlByte, oper1); - LowerNode(oper1); - - // Convert to scalar - // Here, we try to insert a Vector128 to Scalar node so that the input - // can be provided to the scalar cast - GenTree* oper2 = comp->gtNewSimdHWIntrinsicNode(srcType, oper1, NI_Vector128_ToScalar, fieldType, 16); - BlockRange().InsertAfter(oper1, oper2); - LowerNode(oper2); - - castOutput = comp->gtNewCastNode(genActualType(dstType), oper2, false, dstType); - BlockRange().InsertAfter(oper2, castOutput); - } - else - { - CorInfoType destFieldType = (dstType == TYP_INT) ? CORINFO_TYPE_INT : CORINFO_TYPE_LONG; - - ssize_t actualMaxVal = (dstType == TYP_INT) ? INT32_MAX : INT64_MAX; - - // run vfixupimmsd base on table and no flags reporting - GenTree* fixupVal = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, castOp, op1Clone1, tbl, ctrlByte, - fixupHwIntrinsicID, fieldType, 16); - BlockRange().InsertAfter(ctrlByte, fixupVal); - LowerNode(fixupVal); - - // get the max value vector - GenTree* maxValScalar = (srcType == TYP_DOUBLE) - ? comp->gtNewDconNodeD(static_cast(actualMaxVal)) - : comp->gtNewDconNodeF(static_cast(actualMaxVal)); - GenTree* maxVal = comp->gtNewSimdCreateBroadcastNode(TYP_SIMD16, maxValScalar, fieldType, 16); - BlockRange().InsertAfter(fixupVal, maxVal); - - GenTree* maxValDstTypeScalar = (dstType == TYP_INT) ? comp->gtNewIconNode(actualMaxVal, dstType) - : comp->gtNewLconNode(actualMaxVal); - GenTree* maxValDstType = - comp->gtNewSimdCreateBroadcastNode(TYP_SIMD16, maxValDstTypeScalar, destFieldType, 16); - BlockRange().InsertAfter(maxVal, maxValDstType); - - // usage 1 --> compare with max value of integer - GenTree* compMask = comp->gtNewSimdCmpOpNode(GT_GE, TYP_SIMD16, fixupVal, maxVal, fieldType, 16); - BlockRange().InsertAfter(maxValDstType, compMask); - - // convert fixupVal to local variable and clone it for further use - LIR::Use fixupValUse(BlockRange(), &(compMask->AsHWIntrinsic()->Op(1)), compMask); - ReplaceWithLclVar(fixupValUse); - fixupVal = compMask->AsHWIntrinsic()->Op(1); - GenTree* fixupValClone = comp->gtClone(fixupVal); - LowerNode(compMask); - BlockRange().InsertAfter(fixupVal, fixupValClone); - - GenTree* FixupValCloneScalar = - comp->gtNewSimdHWIntrinsicNode(srcType, fixupValClone, NI_Vector128_ToScalar, fieldType, 16); - BlockRange().InsertAfter(compMask, FixupValCloneScalar); - LowerNode(FixupValCloneScalar); - - // cast it - GenTreeCast* newCast = comp->gtNewCastNode(dstType, FixupValCloneScalar, false, dstType); - BlockRange().InsertAfter(FixupValCloneScalar, newCast); - - GenTree* newTree = comp->gtNewSimdCreateBroadcastNode(TYP_SIMD16, newCast, destFieldType, 16); - BlockRange().InsertAfter(newCast, newTree); - LowerNode(newTree); - - // usage 2 --> use the compared mask with input value and max value to blend - GenTree* control = comp->gtNewIconNode(0xCA); // (B & A) | (C & ~A) - BlockRange().InsertAfter(newTree, control); - GenTree* cndSelect = comp->gtNewSimdTernaryLogicNode(TYP_SIMD16, compMask, maxValDstType, newTree, - control, destFieldType, 16); - BlockRange().InsertAfter(control, cndSelect); - LowerNode(cndSelect); - - castOutput = - comp->gtNewSimdHWIntrinsicNode(dstType, cndSelect, NI_Vector128_ToScalar, destFieldType, 16); - BlockRange().InsertAfter(cndSelect, 
castOutput); - LowerNode(castOutput); - } - } - else if (varTypeIsSigned(dstType) && comp->compOpportunisticallyDependsOn(InstructionSet_SSE41)) - { - CorInfoType destFieldType = (dstType == TYP_INT) ? CORINFO_TYPE_INT : CORINFO_TYPE_LONG; - - ssize_t actualMaxVal = (dstType == TYP_INT) ? INT32_MAX : INT64_MAX; - - // create clones for usage - GenTree* castOpClone1 = comp->gtClone(castOp); - GenTree* castOpClone2 = comp->gtClone(castOp); - BlockRange().InsertAfter(castOp, castOpClone1); - BlockRange().InsertAfter(castOpClone1, castOpClone2); - - GenTree* oper = comp->gtNewSimdCreateBroadcastNode(TYP_SIMD16, castOp, fieldType, 16); - BlockRange().InsertAfter(castOpClone2, oper); - LowerNode(oper); - GenTree* op1Clone1 = comp->gtNewSimdCreateBroadcastNode(TYP_SIMD16, castOpClone1, fieldType, 16); - BlockRange().InsertAfter(oper, op1Clone1); - LowerNode(op1Clone1); - GenTree* op1Clone2 = comp->gtNewSimdCreateBroadcastNode(TYP_SIMD16, castOpClone2, fieldType, 16); - BlockRange().InsertAfter(op1Clone1, op1Clone2); - LowerNode(op1Clone2); - - // check NaN - GenTree* mask1 = comp->gtNewSimdCmpOpNode(GT_EQ, TYP_SIMD16, oper, op1Clone1, fieldType, 16); - BlockRange().InsertAfter(op1Clone2, mask1); - LowerNode(mask1); - // inp = inp & mask - GenTree* maskNaN = comp->gtNewSimdBinOpNode(GT_AND, TYP_SIMD16, op1Clone2, mask1, fieldType, 16); - BlockRange().InsertAfter(mask1, maskNaN); - LowerNode(maskNaN); - - // get the max value vector - GenTree* maxVal = (srcType == TYP_DOUBLE) ? comp->gtNewDconNodeD(static_cast(actualMaxVal)) - : comp->gtNewDconNodeF(static_cast(actualMaxVal)); - GenTree* maxValDup = - (dstType == TYP_INT) ? comp->gtNewIconNode(actualMaxVal) : comp->gtNewLconNode(actualMaxVal); - maxVal = comp->gtNewSimdCreateBroadcastNode(TYP_SIMD16, maxVal, fieldType, 16); - BlockRange().InsertAfter(maskNaN, maxVal); - maxValDup = comp->gtNewSimdCreateBroadcastNode(TYP_SIMD16, maxValDup, destFieldType, 16); - BlockRange().InsertAfter(maxVal, maxValDup); - - // usage 1 --> compare with max value of integer - GenTree* compMask = comp->gtNewSimdCmpOpNode(GT_GE, TYP_SIMD16, maskNaN, maxVal, fieldType, 16); - BlockRange().InsertAfter(maxValDup, compMask); - - // we will be using the maskNaN value twice - LIR::Use maskNaNUse(BlockRange(), &(compMask->AsHWIntrinsic()->Op(1)), compMask); - ReplaceWithLclVar(maskNaNUse); - maskNaN = compMask->AsHWIntrinsic()->Op(1); - GenTree* maskNaNClone = comp->gtClone(maskNaN); - LowerNode(compMask); - BlockRange().InsertAfter(maskNaN, maskNaNClone); - - // convert to scalar for conversion - GenTree* maskNaNCloneScalar = - comp->gtNewSimdHWIntrinsicNode(srcType, maskNaNClone, NI_Vector128_ToScalar, fieldType, 16); - BlockRange().InsertAfter(compMask, maskNaNCloneScalar); - LowerNode(maskNaNCloneScalar); - - // cast it - GenTreeCast* newCast = comp->gtNewCastNode(dstType, maskNaNCloneScalar, false, dstType); - BlockRange().InsertAfter(maskNaNCloneScalar, newCast); - GenTree* newTree = comp->gtNewSimdCreateBroadcastNode(TYP_SIMD16, newCast, destFieldType, 16); - BlockRange().InsertAfter(newCast, newTree); - LowerNode(newTree); - - // usage 2 --> use thecompared mask with input value and max value to blend - GenTree* cndSelect = comp->gtNewSimdCndSelNode(TYP_SIMD16, compMask, maxValDup, newTree, destFieldType, 16); - BlockRange().InsertAfter(newTree, cndSelect); - LowerNode(cndSelect); - - castOutput = comp->gtNewSimdHWIntrinsicNode(dstType, cndSelect, NI_Vector128_ToScalar, destFieldType, 16); - BlockRange().InsertAfter(cndSelect, castOutput); - LowerNode(castOutput); - 
} - else + CorInfoType srcBaseType = (srcType == TYP_FLOAT) ? CORINFO_TYPE_FLOAT : CORINFO_TYPE_DOUBLE; + LIR::Range castRange = LIR::EmptyRange(); + + // We'll be using SIMD instructions to fix up castOp before conversion. + // + // This creates the equivalent of the following C# code: + // var srcVec = Vector128.CreateScalarUnsafe(castOp); + + GenTree* srcVector = comp->gtNewSimdCreateScalarUnsafeNode(TYP_SIMD16, castOp, srcBaseType, 16); + castRange.InsertAtEnd(srcVector); + + if (srcVector->IsCnsVec()) { - // The remaining case not handled above should be conversion - // to TYP_UINT in case where SSE41 is supported. - // We should have converted float -> uint conversion to - // float -> double -> uint during morph. - assert((dstType == TYP_UINT) && comp->compIsaSupportedDebugOnly(InstructionSet_SSE41) && - (srcType != TYP_FLOAT)); - - ssize_t actualMaxVal = UINT32_MAX; - CorInfoType destFieldType = CORINFO_TYPE_LONG; - - GenTree* castOpClone1 = comp->gtClone(castOp); - GenTree* castOpClone2 = comp->gtClone(castOp); - GenTree* castOpClone3 = comp->gtClone(castOp); - BlockRange().InsertAfter(castOp, castOpClone1); - BlockRange().InsertAfter(castOpClone1, castOpClone2); - BlockRange().InsertAfter(castOpClone2, castOpClone3); - - GenTree* oper = comp->gtNewSimdCreateBroadcastNode(TYP_SIMD16, castOp, fieldType, 16); - BlockRange().InsertAfter(castOpClone3, oper); - LowerNode(oper); - GenTree* op1Clone1 = comp->gtNewSimdCreateBroadcastNode(TYP_SIMD16, castOpClone1, fieldType, 16); - BlockRange().InsertAfter(oper, op1Clone1); - LowerNode(op1Clone1); - GenTree* op1Clone2 = comp->gtNewSimdCreateBroadcastNode(TYP_SIMD16, castOpClone2, fieldType, 16); - BlockRange().InsertAfter(op1Clone1, op1Clone2); - LowerNode(op1Clone2); - GenTree* op1Clone3 = comp->gtNewSimdCreateBroadcastNode(TYP_SIMD16, castOpClone3, fieldType, 16); - BlockRange().InsertAfter(op1Clone2, op1Clone3); - LowerNode(op1Clone3); - - // get the max/min value vector - GenTree* minVal = comp->gtNewDconNodeD(static_cast(0)); - minVal = comp->gtNewSimdCreateBroadcastNode(TYP_SIMD16, minVal, fieldType, 16); - BlockRange().InsertAfter(op1Clone3, minVal); - GenTree* maxVal = comp->gtNewDconNodeD(static_cast(actualMaxVal)); - maxVal = comp->gtNewSimdCreateBroadcastNode(TYP_SIMD16, maxVal, fieldType, 16); - BlockRange().InsertAfter(minVal, maxVal); - - // check NaN - GenTree* mask1 = comp->gtNewSimdCmpOpNode(GT_EQ, TYP_SIMD16, oper, op1Clone1, fieldType, 16); - BlockRange().InsertAfter(maxVal, mask1); - LowerNode(mask1); - - // check negative - GenTree* mask2 = comp->gtNewSimdCmpOpNode(GT_GE, TYP_SIMD16, op1Clone2, minVal, fieldType, 16); - BlockRange().InsertAfter(mask1, mask2); - LowerNode(mask2); - - // and mask - GenTree* mask12 = comp->gtNewSimdBinOpNode(GT_AND, TYP_SIMD16, mask1, mask2, fieldType, 16); - BlockRange().InsertAfter(mask2, mask12); - LowerNode(mask12); - - // inp = inp & mask - GenTree* saturatedVal = comp->gtNewSimdBinOpNode(GT_AND, TYP_SIMD16, op1Clone3, mask12, fieldType, 16); - BlockRange().InsertAfter(mask12, saturatedVal); - LowerNode(saturatedVal); - - // compare with max value of uint - GenTree* mask3 = comp->gtNewSimdCmpOpNode(GT_GE, TYP_SIMD16, saturatedVal, maxVal, fieldType, 16); - BlockRange().InsertAfter(saturatedVal, mask3); - - // Convert both the operands of mask3 to local variables for reusage - LIR::Use saturatedValUse(BlockRange(), &(mask3->AsHWIntrinsic()->Op(1)), mask3); - ReplaceWithLclVar(saturatedValUse); - saturatedVal = mask3->AsHWIntrinsic()->Op(1); - GenTree* saturatedValDup = 
comp->gtClone(saturatedVal); - BlockRange().InsertAfter(saturatedVal, saturatedValDup); - - LIR::Use maxValUse(BlockRange(), &(mask3->AsHWIntrinsic()->Op(2)), mask3); - ReplaceWithLclVar(maxValUse); - maxVal = mask3->AsHWIntrinsic()->Op(2); - GenTree* maxValDup = comp->gtClone(maxVal); - LowerNode(mask3); - BlockRange().InsertAfter(maxVal, maxValDup); - - // Select based on mask3 - GenTree* castOpVal = - comp->gtNewSimdCndSelNode(TYP_SIMD16, mask3, maxValDup, saturatedValDup, fieldType, 16); - BlockRange().InsertAfter(mask3, castOpVal); - LowerNode(castOpVal); - - // scalar - GenTree* castOpValScalar = - comp->gtNewSimdHWIntrinsicNode(srcType, castOpVal, NI_Vector128_ToScalar, fieldType, 16); - BlockRange().InsertAfter(castOpVal, castOpValScalar); - LowerNode(castOpValScalar); - - // cast it - castOutput = comp->gtNewCastNode(TYP_INT, castOpValScalar, false, dstType); - BlockRange().InsertAfter(castOpValScalar, castOutput); - } - assert(castOutput != nullptr); - LIR::Use use; - if (BlockRange().TryGetUse(tree, &use)) + castOp->SetUnusedValue(); + } + + if (varTypeIsUnsigned(dstType) && comp->canUseEvexEncoding()) { - use.ReplaceWith(castOutput); + // EVEX unsigned conversion instructions saturate positive overflow properly, so as + // long as we fix up NaN and negative values, we can preserve the existing cast node. + // + // maxs[sd] will take the value from the second operand if the first operand's value is + // NaN, which allows us to fix up both negative and NaN values with a single instruction. + // + // This creates the equivalent of the following C# code: + // castOp = Sse.MaxScalar(srcVec, Vector128.Zero).ToScalar(); + + NamedIntrinsic maxScalarIntrinsic = (srcType == TYP_FLOAT) ? NI_SSE_MaxScalar : NI_SSE2_MaxScalar; + + GenTree* zero = comp->gtNewZeroConNode(TYP_SIMD16); + GenTree* fixupVal = + comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, zero, maxScalarIntrinsic, srcBaseType, 16); + + GenTree* toScalar = comp->gtNewSimdToScalarNode(srcType, fixupVal, srcBaseType, 16); + + castRange.InsertAtEnd(zero); + castRange.InsertAtEnd(fixupVal); + castRange.InsertAtEnd(toScalar); + + tree->AsCast()->CastOp() = toScalar; } else { - castOutput->SetUnusedValue(); + assert(comp->IsBaselineSimdIsaSupportedDebugOnly()); + assert(!TargetArchitecture::Is64Bit || comp->compIsaSupportedDebugOnly(InstructionSet_SSE2_X64)); + + // We need to fix up NaN as well as handle possible overflow. Signed conversions + // return int/long.MinValue for any overflow, which is correct for saturation of + // negative, but the result must be replaced with MaxValue for positive overflow. 
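To make the strategy in the comment above concrete, here is a scalar C# reference (illustrative only, not part of the patch; the method name is invented) for the saturating behavior the non-EVEX signed path produces, shown for double -> int. The SIMD sequence that follows reaches the same result branchlessly: the ordered-compare mask zeroes out NaN inputs, cvttsd2si already returns int.MinValue for any overflow, and the final compare/select substitutes MaxValue for positive overflow.

using System;

static int ToInt32Saturating(double d)
{
    if (double.IsNaN(d))
        return 0;                  // NaN inputs are fixed up to zero
    if (d >= 2147483648.0)         // same bound the lowering loads into maxFloatingValue
        return int.MaxValue;       // positive overflow saturates upward
    if (d < -2147483648.0)
        return int.MinValue;       // negative overflow saturates downward
    return (int)d;                 // in range: ordinary truncation toward zero
}

Console.WriteLine(ToInt32Saturating(double.NaN)); // 0
Console.WriteLine(ToInt32Saturating(3.0e9));      // 2147483647
Console.WriteLine(ToInt32Saturating(-3.0e9));     // -2147483648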
+ + CorInfoType dstBaseType = CORINFO_TYPE_UNDEF; + NamedIntrinsic convertIntrinsic = NI_Illegal; + GenTree* maxIntegralValue = nullptr; + GenTree* maxFloatingValue = comp->gtNewVconNode(TYP_SIMD16); + simd_t* maxFloatSimdVal = &maxFloatingValue->AsVecCon()->gtSimdVal; + + switch (dstType) + { + case TYP_INT: + { + dstBaseType = CORINFO_TYPE_INT; + maxIntegralValue = comp->gtNewIconNode(INT32_MAX); + if (srcType == TYP_FLOAT) + { + maxFloatSimdVal->f32[0] = 2147483648.0f; + convertIntrinsic = NI_SSE_ConvertToInt32WithTruncation; + } + else + { + maxFloatSimdVal->f64[0] = 2147483648.0; + convertIntrinsic = NI_SSE2_ConvertToInt32WithTruncation; + } + break; + } + case TYP_UINT: + { + dstBaseType = CORINFO_TYPE_UINT; + maxIntegralValue = comp->gtNewIconNode(static_cast(UINT32_MAX)); + if (srcType == TYP_FLOAT) + { + maxFloatSimdVal->f32[0] = 4294967296.0f; + convertIntrinsic = comp->compOpportunisticallyDependsOn(InstructionSet_SSE_X64) + ? NI_SSE_X64_ConvertToInt64WithTruncation + : NI_SSE2_ConvertToVector128Int32WithTruncation; + } + else + { + maxFloatSimdVal->f64[0] = 4294967296.0; + convertIntrinsic = comp->compOpportunisticallyDependsOn(InstructionSet_SSE2_X64) + ? NI_SSE2_X64_ConvertToInt64WithTruncation + : NI_SSE2_ConvertToVector128Int32WithTruncation; + } + break; + } + case TYP_LONG: + { + dstBaseType = CORINFO_TYPE_LONG; + maxIntegralValue = comp->gtNewLconNode(INT64_MAX); + if (srcType == TYP_FLOAT) + { + maxFloatSimdVal->f32[0] = 9223372036854775808.0f; + convertIntrinsic = NI_SSE_X64_ConvertToInt64WithTruncation; + } + else + { + maxFloatSimdVal->f64[0] = 9223372036854775808.0; + convertIntrinsic = NI_SSE2_X64_ConvertToInt64WithTruncation; + } + break; + } + case TYP_ULONG: + { + dstBaseType = CORINFO_TYPE_ULONG; + maxIntegralValue = comp->gtNewLconNode(static_cast(UINT64_MAX)); + if (srcType == TYP_FLOAT) + { + maxFloatSimdVal->f32[0] = 18446744073709551616.0f; + convertIntrinsic = NI_SSE_X64_ConvertToInt64WithTruncation; + } + else + { + maxFloatSimdVal->f64[0] = 18446744073709551616.0; + convertIntrinsic = NI_SSE2_X64_ConvertToInt64WithTruncation; + } + break; + } + default: + { + unreached(); + } + } + + // We will use the input value at least twice, so we preemptively replace it with a lclVar. + LIR::Use srcUse; + LIR::Use::MakeDummyUse(castRange, srcVector, &srcUse); + srcUse.ReplaceWithLclVar(comp); + srcVector = srcUse.Def(); + + GenTree* srcClone = nullptr; + GenTree* convertResult = nullptr; + + if (varTypeIsSigned(dstType)) + { + // Fix up NaN values before conversion. Saturation is handled after conversion, + // because MaxValue may not be precisely representable in the floating format. + // + // This creates the equivalent of the following C# code: + // var nanMask = Sse.CompareScalarOrdered(srcVec, srcVec); + // var fixupVal = Sse.And(srcVec, nanMask); + // convertResult = Sse.ConvertToInt32WithTruncation(fixupVal); + + NamedIntrinsic compareNaNIntrinsic = + (srcType == TYP_FLOAT) ? 
NI_SSE_CompareScalarOrdered : NI_SSE2_CompareScalarOrdered; + + srcClone = comp->gtClone(srcVector); + GenTree* nanMask = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, srcClone, compareNaNIntrinsic, + srcBaseType, 16); + + castRange.InsertAtEnd(srcClone); + castRange.InsertAtEnd(nanMask); + + srcClone = comp->gtClone(srcVector); + GenTree* fixupVal = comp->gtNewSimdBinOpNode(GT_AND, TYP_SIMD16, nanMask, srcClone, srcBaseType, 16); + + castRange.InsertAtEnd(srcClone); + castRange.InsertAtEnd(fixupVal); + + convertResult = comp->gtNewSimdHWIntrinsicNode(dstType, fixupVal, convertIntrinsic, srcBaseType, 16); + } + else + { + // maxs[sd] will take the value from the second operand if the first operand's value is + // NaN, which allows us to fix up both negative and NaN values with a single instruction. + // + // This creates the equivalent of the following C# code: + // var fixupVal = Sse.MaxScalar(srcVec, Vector128.Zero); + + NamedIntrinsic maxScalarIntrinsic = (srcType == TYP_FLOAT) ? NI_SSE_MaxScalar : NI_SSE2_MaxScalar; + + GenTree* zero = comp->gtNewZeroConNode(TYP_SIMD16); + GenTree* fixupVal = + comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcVector, zero, maxScalarIntrinsic, srcBaseType, 16); + + castRange.InsertAtEnd(zero); + castRange.InsertAtEnd(fixupVal); + + if ((dstType == TYP_UINT) && ((convertIntrinsic == NI_SSE_X64_ConvertToInt64WithTruncation) || + (convertIntrinsic == NI_SSE2_X64_ConvertToInt64WithTruncation))) + { + // On x64, we can use long conversion to handle uint directly. + convertResult = + comp->gtNewSimdHWIntrinsicNode(TYP_LONG, fixupVal, convertIntrinsic, srcBaseType, 16); + } + else + { + // We're doing a conversion that isn't supported directly by hardware. We will emulate + // the unsigned conversion by using the signed instruction on both the fixed-up input + // value and a negative value that has the same bit representation when converted to + // integer. If the conversion overflows as a signed integer, the negative conversion + // result is selected. + // + // This creates the equivalent of the following C# code: + // var wrapVal = Sse.SubtractScalar(srcVec, maxFloatingValue); + + NamedIntrinsic subtractIntrinsic = + (srcType == TYP_FLOAT) ? NI_SSE_SubtractScalar : NI_SSE2_SubtractScalar; + + // We're going to use maxFloatingValue twice, so replace the constant with a lclVar. + castRange.InsertAtEnd(maxFloatingValue); + + LIR::Use maxFloatUse; + LIR::Use::MakeDummyUse(castRange, maxFloatingValue, &maxFloatUse); + maxFloatUse.ReplaceWithLclVar(comp); + maxFloatingValue = maxFloatUse.Def(); + + GenTree* floorVal = comp->gtClone(srcVector); + castRange.InsertAtEnd(floorVal); + + if ((srcType == TYP_DOUBLE) && (dstType == TYP_UINT)) + { + // This technique works only if the truncating conversion of the positive and negative + // values causes them to round in the same direction. i.e. there is no rounding, because + // we have a whole number. This is always true if the exponent is larger than the number + // of significand bits, which will always be the case for double->ulong or float->uint. + // + // For double->uint, the double has enough precision to exactly represent any whole number + // in range, with bits left over. e.g. we might have a value of 4294967295.9999995. + // We must, therefore, truncate the value before wrapping it to negative. 
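As a sanity check on the bit trick used just below when roundsd is unavailable, here is a small stand-alone C# sketch (illustrative only; FloorByMasking is an invented name). It assumes the input lies in [2^31, 2^32), where the double's exponent is 31, so the low 52 - 21 = 31 bits of the significand hold the whole part and the remaining 21 bits are fractional; clearing them floors the value, including the 4294967295.9999995 case mentioned above.

using System;

static double FloorByMasking(double d)
{
    // Assumes 2^31 <= d < 2^32, the only case in which the lowering consumes
    // the masked value; clearing the 21 fraction bits floors it.
    long bits = BitConverter.DoubleToInt64Bits(d);
    return BitConverter.Int64BitsToDouble((bits >> 21) << 21);
}

Console.WriteLine(FloorByMasking(4294967295.9999995)); // 4294967295
Console.WriteLine(FloorByMasking(2147483648.5));       // 2147483648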
+ + if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + // This creates the equivalent of the following C# code: + // floorVal = Sse41.RoundToZeroScalar(srcVector); + + floorVal = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, floorVal, NI_SSE41_RoundToZeroScalar, + srcBaseType, 16); + castRange.InsertAtEnd(floorVal); + } + else + { + // We don't have `roundsd` available, but we can truncate the value by simply zeroing out + // the low 21 bits of the double. This works because we know we will only use the negative + // value when the exponent is exactly 31, meaning 31 of the 52 bits in the significand are + // used for the whole portion of the number, and the remaining 21 bits are fractional. + // + // This creates the equivalent of the following C# code: + // floorVal = ((srcVector.AsUInt64() >>> 21) << 21).AsDouble(); + + GenTree* twentyOne = comp->gtNewIconNode(21); + GenTree* rightShift = comp->gtNewSimdBinOpNode(GT_RSZ, TYP_SIMD16, floorVal, twentyOne, + CORINFO_TYPE_ULONG, 16); + castRange.InsertAtEnd(twentyOne); + castRange.InsertAtEnd(rightShift); + + twentyOne = comp->gtClone(twentyOne); + floorVal = comp->gtNewSimdBinOpNode(GT_LSH, TYP_SIMD16, rightShift, twentyOne, + CORINFO_TYPE_ULONG, 16); + castRange.InsertAtEnd(twentyOne); + castRange.InsertAtEnd(floorVal); + } + } + + GenTree* wrapVal = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, floorVal, maxFloatingValue, + subtractIntrinsic, srcBaseType, 16); + castRange.InsertAtEnd(wrapVal); + + maxFloatingValue = comp->gtClone(maxFloatingValue); + + if (dstType == TYP_UINT) + { + // We can keep the conversion results in SIMD registers to make selection of the + // correct result simpler. + // + // This creates the equivalent of the following C# code: + // var result = Sse2.ConvertToVector128Int32WithTruncation(fixupVal); + // var negated = Sse2.ConvertToVector128Int32WithTruncation(wrapVal); + + GenTree* result = + comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, fixupVal, convertIntrinsic, srcBaseType, 16); + GenTree* negated = + comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, wrapVal, convertIntrinsic, srcBaseType, 16); + + castRange.InsertAtEnd(result); + castRange.InsertAtEnd(negated); + + // We need the result twice -- one for the mask bit and one for the blend. + LIR::Use resultUse; + LIR::Use::MakeDummyUse(castRange, result, &resultUse); + resultUse.ReplaceWithLclVar(comp); + result = resultUse.Def(); + + GenTree* resultClone = comp->gtClone(result); + castRange.InsertAtEnd(resultClone); + + if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41)) + { + // If the conversion of the fixed-up value overflowed, the result wil be + // int.MinValue. Since `blendvps` uses only the MSB for result selection, + // this is adequate to force selection of the negated result. + // + // This creates the equivalent of the following C# code: + // convertResult = Sse41.BlendVariable(result, negated, result); + + convertResult = + comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, result, negated, resultClone, + NI_SSE41_BlendVariable, CORINFO_TYPE_FLOAT, 16); + } + else + { + // If we can't use `blendvps`, we do a bit-wise selection. This works + // using only and+or because if we choose the negated value, both it + // and the overflowed result have MSB set. 
+ // + // This creates the equivalent of the following C# code: + // var mask = Sse2.ShiftRightArithmetic(result, 31); + // convertResult = Sse.Or(result, Sse.And(negated, mask)); + + GenTree* thirtyOne = comp->gtNewIconNode(31); + GenTree* mask = + comp->gtNewSimdBinOpNode(GT_RSH, TYP_SIMD16, result, thirtyOne, CORINFO_TYPE_INT, 16); + GenTree* andMask = + comp->gtNewSimdBinOpNode(GT_AND, TYP_SIMD16, mask, negated, dstBaseType, 16); + + castRange.InsertAtEnd(thirtyOne); + castRange.InsertAtEnd(mask); + castRange.InsertAtEnd(andMask); + + convertResult = + comp->gtNewSimdBinOpNode(GT_OR, TYP_SIMD16, andMask, resultClone, dstBaseType, 16); + } + + // Because the results are in a SIMD register, we need to ToScalar() them out. + castRange.InsertAtEnd(convertResult); + convertResult = comp->gtNewSimdToScalarNode(TYP_INT, convertResult, dstBaseType, 16); + } + else + { + assert(dstType == TYP_ULONG); + + // We're emulating floating->ulong conversion on x64. The logic is the same as for + // uint on x86, except that we don't have conversion instructions that keep the + // results in SIMD registers, so we do the final result selection in scalar code. + // + // This creates the equivalent of the following C# code: + // long result = Sse.X64.ConvertToInt64WithTruncation(fixupVal); + // long negated = Sse.X64.ConvertToInt64WithTruncation(wrapVal); + // convertResult = (ulong)(result | (negated & (result >> 63))); + + GenTree* result = + comp->gtNewSimdHWIntrinsicNode(TYP_LONG, fixupVal, convertIntrinsic, srcBaseType, 16); + GenTree* negated = + comp->gtNewSimdHWIntrinsicNode(TYP_LONG, wrapVal, convertIntrinsic, srcBaseType, 16); + + castRange.InsertAtEnd(result); + castRange.InsertAtEnd(negated); + + // We need the result twice -- once for the mask bit and once for the blend. + LIR::Use resultUse; + LIR::Use::MakeDummyUse(castRange, result, &resultUse); + resultUse.ReplaceWithLclVar(comp); + result = resultUse.Def(); + + GenTree* sixtyThree = comp->gtNewIconNode(63); + GenTree* mask = comp->gtNewOperNode(GT_RSH, TYP_LONG, result, sixtyThree); + GenTree* andMask = comp->gtNewOperNode(GT_AND, TYP_LONG, mask, negated); + GenTree* resultClone = comp->gtClone(result); + + castRange.InsertAtEnd(sixtyThree); + castRange.InsertAtEnd(mask); + castRange.InsertAtEnd(andMask); + castRange.InsertAtEnd(resultClone); + + convertResult = comp->gtNewOperNode(GT_OR, TYP_LONG, andMask, resultClone); + } + } + } + + // Now we handle saturation of the result for positive overflow. + // + // This creates the equivalent of the following C# code: + // bool isOverflow = Sse.CompareScalarUnorderedGreaterThanOrEqual(srcVec, maxFloatingValue); + // return isOverflow ? maxIntegralValue : convertResult; + + NamedIntrinsic compareIntrinsic = (srcType == TYP_FLOAT) ? NI_SSE_CompareScalarUnorderedGreaterThanOrEqual + : NI_SSE2_CompareScalarUnorderedGreaterThanOrEqual; + + // These nodes were all created above but not used until now. 
+ castRange.InsertAtEnd(maxFloatingValue); + castRange.InsertAtEnd(maxIntegralValue); + castRange.InsertAtEnd(convertResult); + + srcClone = comp->gtClone(srcVector); + GenTree* compareMax = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, srcClone, maxFloatingValue, + compareIntrinsic, srcBaseType, 16); + GenTree* select = comp->gtNewConditionalNode(GT_SELECT, compareMax, maxIntegralValue, convertResult, + genActualType(dstType)); + + castRange.InsertAtEnd(srcClone); + castRange.InsertAtEnd(compareMax); + castRange.InsertAtEnd(select); + + // The original cast becomes a no-op, because its input is already the correct type. + tree->AsCast()->CastOp() = select; } - BlockRange().Remove(tree); - return castOutput->gtNext; + + LIR::ReadOnlyRange lowerRange(castRange.FirstNode(), castRange.LastNode()); + BlockRange().InsertBefore(tree, std::move(castRange)); + + JITDUMP("LowerCast after:\n"); + DISPTREERANGE(BlockRange(), tree); + + LowerRange(lowerRange); } -#endif // TARGET_AMD64 +#endif // FEATURE_HW_INTRINSICS // Now determine if we have operands that should be contained. ContainCheckCast(tree->AsCast()); - return nullptr; } #ifdef FEATURE_HW_INTRINSICS diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index 79b9d2743c9375..c63e28b38e7ef7 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -299,21 +299,14 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree) // This goes through helper and hence src needs to be converted to double. && tree->gtOverflow() #elif defined(TARGET_AMD64) - // Amd64: src = float, dst = uint64 or overflow conversion. - // src needs to be converted to double except for the following cases - // dstType = int/uint/ulong for AVX512F - // dstType = int for SSE41 - // For pre-SSE41, the all src is converted to TYP_DOUBLE - // and goes through helpers. - && - (tree->gtOverflow() || (dstType == TYP_LONG && !compOpportunisticallyDependsOn(InstructionSet_AVX10v2)) || - !(canUseEvexEncoding() || (dstType == TYP_INT && compOpportunisticallyDependsOn(InstructionSet_SSE41)))) + // Amd64: src = float, dst = overflow conversion or SSE2 is not enabled + && (tree->gtOverflow() || !IsBaselineSimdIsaSupported()) #elif defined(TARGET_ARM) // Arm: src = float, dst = int64/uint64 or overflow conversion. && (tree->gtOverflow() || varTypeIsLong(dstType)) #else - // x86: src = float, dst = uint32/int64/uint64 or overflow conversion. - && (tree->gtOverflow() || varTypeIsIntegral(dstType)) + // x86: src = float, dst = int64/uint64 or overflow conversion or SSE2 is not enabled + && (tree->gtOverflow() || varTypeIsLong(dstType) || !IsBaselineSimdIsaSupported()) #endif ) { @@ -339,25 +332,12 @@ GenTree* Compiler::fgMorphExpandCast(GenTreeCast* tree) #if defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) return nullptr; #else -#if defined(TARGET_AMD64) - // Following nodes are handled when lowering the nodes - // float -> ulong/uint/int/long for AVX10.2 - // double -> ulong/uint/int/long for AVX10.2 - // float -> ulong/uint/int for AVX512F - // double -> ulong/uint/long/int for AVX512F - // float -> int for SSE41 - // double -> int/uint/long for SSE41 - // For all other conversions, we use helper functions. 
- if (canUseEvexEncoding() || - ((dstType != TYP_ULONG) && compOpportunisticallyDependsOn(InstructionSet_SSE41))) - { - if (tree->CastOp() != oper) - { - tree->CastOp() = oper; - } +#if defined(TARGET_XARCH) + if (IsBaselineSimdIsaSupported() && (!varTypeIsLong(dstType) || TargetArchitecture::Is64Bit)) + { return nullptr; } -#endif // TARGET_AMD64 +#endif // TARGET_XARCH switch (dstType) { case TYP_INT:
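The most involved piece above is the emulated unsigned path (double -> uint when neither EVEX nor a 64-bit signed conversion is available). The following scalar C# model shows what that branchless SIMD sequence computes (illustrative only; the method name is invented, and the if/else here stands in for the sign-mask/blend select in the generated code):

using System;

static uint ToUInt32Saturating(double d)
{
    if (double.IsNaN(d) || d < 0.0)
        return 0;                        // maxsd against zero handles NaN and negatives
    if (d >= 4294967296.0)
        return uint.MaxValue;            // positive overflow saturates via the final GT_SELECT

    double floorVal = Math.Floor(d);     // roundsd, or the 21-bit masking shown earlier
    if (floorVal < 2147483648.0)
        return (uint)(int)floorVal;      // the signed cvttsd2si result is already correct
    // In [2^31, 2^32): convert the wrapped value instead; its signed result has the
    // same 32-bit pattern as the desired unsigned value.
    return (uint)(int)(floorVal - 4294967296.0);
}

Console.WriteLine(ToUInt32Saturating(3000000000.7)); // 3000000000
Console.WriteLine(ToUInt32Saturating(-1.5));         // 0
Console.WriteLine(ToUInt32Saturating(1e20));         // 4294967295

Note that on 32-bit x86 the updated morph.cpp condition above still sends float/double -> long/ulong (and all overflow-checking casts) to helper calls, so only the 32-bit integer targets reach this lowering path there.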