From f71ea561204604e1e3d2a8a49930781a4fa24c6d Mon Sep 17 00:00:00 2001
From: Tanner Gooding
Date: Fri, 28 Jun 2024 12:36:38 -0700
Subject: [PATCH 1/4] Centralize the folding logic for ConditionalSelect and
 ensure side effects aren't dropped

---
 src/coreclr/jit/gentree.cpp            | 129 +++++++----
 src/coreclr/jit/hwintrinsic.cpp        |  13 --
 src/coreclr/jit/hwintrinsiclistxarch.h |   6 +-
 src/coreclr/jit/lowerxarch.cpp         | 283 ++++++++++++++++++-------
 src/coreclr/jit/morph.cpp              |  20 +-
 src/coreclr/jit/valuenum.cpp           | 123 +++++++----
 6 files changed, 388 insertions(+), 186 deletions(-)

diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index ce92e74c0175c..7e69c011f45b7 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -23435,18 +23435,14 @@ GenTree* Compiler::gtNewSimdCndSelNode(
     NamedIntrinsic intrinsic = NI_Illegal;
 
 #if defined(TARGET_XARCH)
-    assert((simdSize != 32) || compIsaSupportedDebugOnly(InstructionSet_AVX));
-
-    if (canUseEvexEncoding())
+    if (simdSize == 64)
     {
-        GenTree* control = gtNewIconNode(0xCA); // (B & A) | (C & ~A)
-        return gtNewSimdTernaryLogicNode(type, op1, op2, op3, control, simdBaseJitType, simdSize);
+        assert(canUseEvexEncodingDebugOnly());
+        intrinsic = NI_Vector512_ConditionalSelect;
    }
-
-    assert(simdSize != 64);
-
-    if (simdSize == 32)
+    else if (simdSize == 32)
     {
+        assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
         intrinsic = NI_Vector256_ConditionalSelect;
     }
     else
@@ -26562,45 +26558,19 @@ GenTree* Compiler::gtNewSimdUnOpNode(
         {
             if (simdSize == 64)
             {
-                assert(compIsaSupportedDebugOnly(InstructionSet_AVX512F));
-
-                if (genTypeSize(simdBaseType) >= 4)
-                {
-                    intrinsic = NI_AVX512F_TernaryLogic;
-                }
-            }
-            else
-            {
-                if (simdSize == 32)
-                {
-                    assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
-                }
-                if ((genTypeSize(simdBaseType) >= 4) && compOpportunisticallyDependsOn(InstructionSet_AVX10v1))
-                {
-                    intrinsic = NI_AVX10v1_TernaryLogic;
-                }
-                else if ((genTypeSize(simdBaseType) >= 4) && compOpportunisticallyDependsOn(InstructionSet_AVX512F_VL))
-                {
-                    intrinsic = NI_AVX512F_VL_TernaryLogic;
-                }
+                assert(canUseEvexEncodingDebugOnly());
+                intrinsic = NI_Vector512_op_OnesComplement;
             }
-
-            if (intrinsic != NI_Illegal)
+            else if (simdSize == 32)
             {
-                // AVX512 allows performing `not` without requiring a memory access
-                assert(canUseEvexEncodingDebugOnly());
-
-                op2 = gtNewZeroConNode(type);
-                op3 = gtNewZeroConNode(type);
-
-                GenTree* cns = gtNewIconNode(static_cast<uint8_t>(~0xAA)); // ~C
-                return gtNewSimdTernaryLogicNode(type, op3, op2, op1, cns, simdBaseJitType, simdSize);
+                assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
+                intrinsic = NI_Vector256_op_OnesComplement;
             }
             else
             {
-                op2 = gtNewAllBitsSetConNode(type);
-                return gtNewSimdBinOpNode(GT_XOR, type, op1, op2, simdBaseJitType, simdSize);
+                intrinsic = NI_Vector128_op_OnesComplement;
             }
+            return gtNewSimdHWIntrinsicNode(type, op1, intrinsic, simdBaseJitType, simdSize);
         }
 #elif defined(TARGET_ARM64)
         case GT_NEG:
@@ -28150,12 +28120,16 @@ genTreeOps GenTreeHWIntrinsic::HWOperGet(bool* isScalar) const
             return GT_AND;
         }
 
-#if defined(TARGET_ARM64)
+#if defined(TARGET_XARCH)
+        case NI_Vector128_op_OnesComplement:
+        case NI_Vector256_op_OnesComplement:
+        case NI_Vector512_op_OnesComplement:
+#elif defined(TARGET_ARM64)
         case NI_AdvSimd_Not:
+#endif
         {
             return GT_NOT;
         }
-#endif
 
 #if defined(TARGET_XARCH)
         case NI_SSE_Xor:
@@ -30512,6 +30486,73 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
             }
         }
 
+    if (op3 != nullptr)
+    {
+        switch (ni)
+        {
+            case NI_Vector128_ConditionalSelect:
+#if defined(TARGET_XARCH)
+            case NI_Vector256_ConditionalSelect:
+            case NI_Vector512_ConditionalSelect:
+#elif defined(TARGET_ARM64)
+            case NI_Vector64_ConditionalSelect:
+            case NI_Sve_ConditionalSelect:
+#endif
+            {
+                if (cnsNode != op1)
+                {
+                    break;
+                }
+
+                if (op1->IsVectorAllBitsSet() || op1->IsMaskAllBitsSet())
+                {
+                    if ((op3->gtFlags & GTF_SIDE_EFFECT) == 0)
+                    {
+                        // op3 has no side effects, so we can return op2 directly
+                        return op2;
+                    }
+
+                    if ((op2->gtFlags & GTF_SIDE_EFFECT) == 0)
+                    {
+                        // op2 has no side effects, but op3 does, we need to wrap
+                        // so that the side effects for op3 are preserved
+                        return gtWrapWithSideEffects(op2, op3, GTF_ALL_EFFECT);
+                    }
+
+                    // op2 and op3 both have side effects, we would have to evaluate op2
+                    // into a local and then wrap the op3 with side effects returning that
+                    // local, which isn't safe to do from the general purpose handler here
+                    break;
+                }
+
+                if (op1->IsVectorZero())
+                {
+                    return gtWrapWithSideEffects(op3, op2, GTF_ALL_EFFECT);
+                }
+
+                if (op2->IsCnsVec() && op3->IsCnsVec())
+                {
+                    // op2 = op2 & op1
+                    op2->AsVecCon()->EvaluateBinaryInPlace(GT_AND, false, simdBaseType, op1->AsVecCon());
+
+                    // op3 = op3 & ~op1
+                    op3->AsVecCon()->EvaluateBinaryInPlace(GT_AND_NOT, false, simdBaseType, op1->AsVecCon());
+
+                    // op2 = op2 | op3
+                    op2->AsVecCon()->EvaluateBinaryInPlace(GT_OR, false, simdBaseType, op3->AsVecCon());
+
+                    resultNode = op2;
+                }
+                break;
+            }
+
+            default:
+            {
+                break;
+            }
+        }
+    }
+
     if (resultNode != tree)
     {
         if (fgGlobalMorph)
diff --git a/src/coreclr/jit/hwintrinsic.cpp b/src/coreclr/jit/hwintrinsic.cpp
index 6f2fe7b68aded..718091443588f 100644
--- a/src/coreclr/jit/hwintrinsic.cpp
+++ b/src/coreclr/jit/hwintrinsic.cpp
@@ -1903,19 +1903,6 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic,
 
     switch (intrinsic)
     {
-        case NI_Sve_ConditionalSelect:
-        {
-            if (op1->IsVectorAllBitsSet() || op1->IsMaskAllBitsSet())
-            {
-                return retNode->AsHWIntrinsic()->Op(2);
-            }
-            else if (op1->IsVectorZero())
-            {
-                return retNode->AsHWIntrinsic()->Op(3);
-            }
-            break;
-        }
-
         case NI_Sve_GetActiveElementCount:
         case NI_Sve_TestAnyTrue:
         case NI_Sve_TestFirstTrue:
diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h
index c19511f750869..02365054fd171 100644
--- a/src/coreclr/jit/hwintrinsiclistxarch.h
+++ b/src/coreclr/jit/hwintrinsiclistxarch.h
@@ -119,7 +119,7 @@ HARDWARE_INTRINSIC(Vector128, op_ExclusiveOr,
 HARDWARE_INTRINSIC(Vector128, op_Inequality, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_Commutative|HW_Flag_CanBenefitFromConstantProp)
 HARDWARE_INTRINSIC(Vector128, op_LeftShift, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
 HARDWARE_INTRINSIC(Vector128, op_Multiply, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
-HARDWARE_INTRINSIC(Vector128, op_OnesComplement, 16, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
+HARDWARE_INTRINSIC(Vector128, op_OnesComplement, 16, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(Vector128, op_RightShift, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
 HARDWARE_INTRINSIC(Vector128, op_Subtraction, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
 HARDWARE_INTRINSIC(Vector128, op_UnaryNegation, 16, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
@@ -221,7 +221,7 @@ HARDWARE_INTRINSIC(Vector256, op_ExclusiveOr,
 HARDWARE_INTRINSIC(Vector256, op_Inequality, 32, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_Commutative|HW_Flag_CanBenefitFromConstantProp)
 HARDWARE_INTRINSIC(Vector256, op_LeftShift, 32, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
 HARDWARE_INTRINSIC(Vector256, op_Multiply, 32, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
-HARDWARE_INTRINSIC(Vector256, op_OnesComplement, 32, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_AvxOnlyCompatible)
+HARDWARE_INTRINSIC(Vector256, op_OnesComplement, 32, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_AvxOnlyCompatible)
 HARDWARE_INTRINSIC(Vector256, op_RightShift, 32, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
 HARDWARE_INTRINSIC(Vector256, op_Subtraction, 32, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
 HARDWARE_INTRINSIC(Vector256, op_UnaryNegation, 32, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
@@ -321,7 +321,7 @@ HARDWARE_INTRINSIC(Vector512, op_ExclusiveOr,
 HARDWARE_INTRINSIC(Vector512, op_Inequality, 64, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_Commutative|HW_Flag_CanBenefitFromConstantProp)
 HARDWARE_INTRINSIC(Vector512, op_LeftShift, 64, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
 HARDWARE_INTRINSIC(Vector512, op_Multiply, 64, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
-HARDWARE_INTRINSIC(Vector512, op_OnesComplement, 64, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
+HARDWARE_INTRINSIC(Vector512, op_OnesComplement, 64, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(Vector512, op_RightShift, 64, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
 HARDWARE_INTRINSIC(Vector512, op_Subtraction, 64, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
 HARDWARE_INTRINSIC(Vector512, op_UnaryNegation, 64, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index 514f3ea3e69e4..7975cfe0a7dbf 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -1415,10 +1415,194 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
             }
         }
 
+    bool       isScalar = false;
+    genTreeOps oper     = node->HWOperGet(&isScalar);
+
+    switch (oper)
+    {
+        case GT_AND:
+        case GT_OR:
+        case GT_XOR:
+        {
+            if (!comp->canUseEvexEncoding())
+            {
+                break;
+            }
+
+            GenTree* op1 = node->Op(1);
+            GenTree* op2 = node->Op(2);
+
+            LIR::Use use;
+            if (BlockRange().TryGetUse(node, &use))
+            {
+                // search for structure like:
+                /*
+                           /- A
+                        +- B
+                    t1 = binary logical op1
+
+                    /- C
+                    +- t1
+                    t2 = binary logical op2
+                */
+                GenTree* second = use.User();
+                if (!second->OperIs(GT_HWINTRINSIC))
+                {
+                    break;
+                }
+
+                bool       nestedIsScalar = false;
+                genTreeOps nestedOper     = second->AsHWIntrinsic()->HWOperGet(&nestedIsScalar);
+
+                if (nestedOper == GT_NONE)
+                {
+                    // TODO: We should support cases like CNDSEL
+                    break;
+                }
+
+                if (nestedIsScalar)
+                {
+                    break;
+                }
+
+                if ((nestedOper != GT_AND) && (nestedOper != GT_OR) && (nestedOper != GT_XOR))
+                {
+                    // TODO: We should support other cases like AND_NOT, NOT, and CNDSEL
+                    break;
+                }
+
+                GenTree* op3 = second->AsHWIntrinsic()->Op(1) == node ? second->AsHWIntrinsic()->Op(2)
+                                                                      : second->AsHWIntrinsic()->Op(1);
+                GenTree*    control         = comp->gtNewIconNode(node->GetTernaryControlByte(second->AsHWIntrinsic()));
+                CorInfoType simdBaseJitType = node->GetSimdBaseJitType();
+                unsigned    simdSize        = node->GetSimdSize();
+                var_types   simdType        = Compiler::getSIMDTypeForSize(simdSize);
+                GenTree*    ternaryNode =
+                    comp->gtNewSimdTernaryLogicNode(simdType, op1, op2, op3, control, simdBaseJitType, simdSize);
+                BlockRange().InsertBefore(second, control, ternaryNode);
+                LIR::Use finalRes;
+                if (BlockRange().TryGetUse(second, &finalRes))
+                {
+                    finalRes.ReplaceWith(ternaryNode);
+                }
+                else
+                {
+                    ternaryNode->SetUnusedValue();
+                }
+                GenTree* next = node->gtNext;
+                BlockRange().Remove(node);
+                BlockRange().Remove(second);
+                return next;
+            }
+            break;
+        }
+
+        case GT_NOT:
+        {
+            var_types simdType     = node->TypeGet();
+            var_types simdBaseType = node->GetSimdBaseType();
+            unsigned  simdSize     = node->GetSimdSize();
+            GenTree*  op1          = node->Op(1);
+
+            assert(varTypeIsSIMD(simdType));
+
+            bool isV512Supported = false;
+            if ((genTypeSize(simdBaseType) >= 4) && comp->compIsEvexOpportunisticallySupported(isV512Supported))
+            {
+                // We can't use the mask, but we can emit a ternary logic node
+                NamedIntrinsic ternaryLogicId = NI_AVX512F_TernaryLogic;
+
+                if (simdSize != 64)
+                {
+                    ternaryLogicId = !isV512Supported ? NI_AVX10v1_TernaryLogic : NI_AVX512F_VL_TernaryLogic;
+                }
+
+                GenTree* op2 = comp->gtNewZeroConNode(simdType);
+                BlockRange().InsertBefore(node, op2);
+
+                GenTree* op3 = comp->gtNewZeroConNode(simdType);
+                BlockRange().InsertBefore(node, op3);
+
+                GenTree* control = comp->gtNewIconNode(static_cast<uint8_t>(~0xAA)); // ~C
+                BlockRange().InsertBefore(node, control);
+
+                node->ResetHWIntrinsicId(ternaryLogicId, comp, op1, op2, op3, control);
+            }
+            else
+            {
+                NamedIntrinsic xorId;
+
+                if (simdSize == 64)
+                {
+                    assert(comp->IsBaselineVector512IsaSupportedDebugOnly());
+
+                    if (varTypeIsFloating(simdBaseType))
+                    {
+                        xorId = NI_AVX512DQ_Xor;
+                    }
+                    else
+                    {
+                        xorId = NI_AVX512F_Xor;
+                    }
+                }
+                else if (simdSize == 32)
+                {
+                    assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX));
+
+                    if (varTypeIsFloating(simdBaseType))
+                    {
+                        xorId = NI_AVX_Xor;
+                    }
+                    else if (comp->compOpportunisticallyDependsOn(InstructionSet_AVX2))
+                    {
+                        xorId = NI_AVX2_Xor;
+                    }
+                    else
+                    {
+                        // Since this is a bitwise operation, we can still support it by lying
+                        // about the type and doing the operation using a supported instruction
+
+                        xorId = NI_AVX_Xor;
+
+                        if (varTypeIsLong(simdBaseType))
+                        {
+                            node->SetSimdBaseJitType(CORINFO_TYPE_DOUBLE);
+                        }
+                        else
+                        {
+                            node->SetSimdBaseJitType(CORINFO_TYPE_FLOAT);
+                        }
+                    }
+                }
+                else if (simdBaseType == TYP_FLOAT)
+                {
+                    xorId = NI_SSE_Xor;
+                }
+                else
+                {
+                    xorId = NI_SSE2_Xor;
+                }
+
+                GenTree* cns = comp->gtNewAllBitsSetConNode(simdType);
+                BlockRange().InsertBefore(node, cns);
+
+                node->ResetHWIntrinsicId(xorId, op1, cns);
+            }
+
+            return LowerNode(node);
+        }
+
+        default:
+        {
+            break;
+        }
+    }
+
     switch (intrinsicId)
     {
         case NI_Vector128_ConditionalSelect:
         case NI_Vector256_ConditionalSelect:
+        case NI_Vector512_ConditionalSelect:
         {
             return LowerHWIntrinsicCndSel(node);
         }
@@ -2137,86 +2321,6 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
             LowerFusedMultiplyAdd(node);
             break;
 
-        case NI_SSE_And:
-        case NI_SSE2_And:
-        case NI_AVX_And:
-        case NI_AVX2_And:
-        case NI_AVX512F_And:
-        case NI_AVX512DQ_And:
-        case NI_SSE_Or:
-        case NI_SSE2_Or:
-        case NI_AVX_Or:
-        case NI_AVX2_Or:
-        case NI_AVX512F_Or:
-        case NI_AVX512DQ_Or:
-        case NI_SSE_Xor:
-        case NI_SSE2_Xor:
-        case NI_AVX_Xor:
-        case NI_AVX2_Xor:
-        case NI_AVX512F_Xor:
-        case NI_AVX512DQ_Xor:
-        case NI_AVX10v1_V512_And:
-        case NI_AVX10v1_V512_Or:
-        case NI_AVX10v1_V512_Xor:
-        {
-            if (!comp->IsBaselineVector512IsaSupportedOpportunistically())
-            {
-                break;
-            }
-            GenTree* op1 = node->Op(1);
-            GenTree* op2 = node->Op(2);
-
-            LIR::Use use;
-            if (BlockRange().TryGetUse(node, &use))
-            {
-                // search for structure like:
-                /*
-                           /- A
-                        +- B
-                    t1 = binary logical op1
-
-                    /- C
-                    +- t1
-                    t2 = binary logical op2
-                */
-                GenTree* second = use.User();
-                if (!second->OperIs(GT_HWINTRINSIC) || !second->AsHWIntrinsic()->OperIsBitwiseHWIntrinsic())
-                {
-                    break;
-                }
-
-                bool isScalar = false;
-                if ((second->AsHWIntrinsic()->HWOperGet(&isScalar) == GT_AND_NOT) || isScalar)
-                {
-                    // currently ANDNOT logic cannot be optimized by the ternary node.
-                    break;
-                }
-                GenTree* op3 = second->AsHWIntrinsic()->Op(1) == node ? second->AsHWIntrinsic()->Op(2)
-                                                                      : second->AsHWIntrinsic()->Op(1);
-                GenTree* control = comp->gtNewIconNode(node->GetTernaryControlByte(second->AsHWIntrinsic()));
-                CorInfoType simdBaseJitType = node->GetSimdBaseJitType();
-                unsigned    simdSize        = node->GetSimdSize();
-                var_types   simdType        = Compiler::getSIMDTypeForSize(simdSize);
-                GenTree*    ternaryNode =
-                    comp->gtNewSimdTernaryLogicNode(simdType, op1, op2, op3, control, simdBaseJitType, simdSize);
-                BlockRange().InsertBefore(second, control, ternaryNode);
-                LIR::Use finalRes;
-                if (BlockRange().TryGetUse(second, &finalRes))
-                {
-                    finalRes.ReplaceWith(ternaryNode);
-                }
-                else
-                {
-                    ternaryNode->SetUnusedValue();
-                }
-                GenTree* next = node->gtNext;
-                BlockRange().Remove(node);
-                BlockRange().Remove(second);
-                return next;
-            }
-            break;
-        }
-
         case NI_AVX512F_TernaryLogic:
         case NI_AVX512F_VL_TernaryLogic:
         case NI_AVX10v1_TernaryLogic:
@@ -3035,7 +3139,26 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node)
         }
     }
 
+    bool isV512Supported = false;
+    if (comp->compIsEvexOpportunisticallySupported(isV512Supported))
+    {
+        // We can't use the mask, but we can emit a ternary logic node
+        NamedIntrinsic ternaryLogicId = NI_AVX512F_TernaryLogic;
+
+        if (simdSize != 64)
+        {
+            ternaryLogicId = !isV512Supported ? NI_AVX10v1_TernaryLogic : NI_AVX512F_VL_TernaryLogic;
+        }
+
+        GenTree* control = comp->gtNewIconNode(0xCA); // (B & A) | (C & ~A)
+        BlockRange().InsertBefore(node, control);
+
+        node->ResetHWIntrinsicId(ternaryLogicId, comp, op1, op2, op3, control);
+        return LowerNode(node);
+    }
+
     // We cannot optimize, so produce unoptimized instructions
+    assert(simdSize != 64);
 
     // We will be constructing the following parts:
     //          /--* op1 simd16
diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index e5ac22723ab9d..3c2301f2d7b39 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -9936,13 +9936,8 @@ GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node)
 
         if (GenTreeHWIntrinsic::OperIsBitwiseHWIntrinsic(oper))
         {
-            if (oper == GT_NOT)
-            {
-                break;
-            }
-
             GenTree* op1 = node->Op(1);
-            GenTree* op2 = node->Op(2);
+            GenTree* op2 = (oper == GT_NOT) ? op1 : node->Op(2);
 
             if (!op1->OperIsHWIntrinsic() || !op2->OperIsHWIntrinsic())
             {
@@ -9990,6 +9985,12 @@ GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node)
                     break;
                 }
 
+                case GT_NOT:
+                {
+                    maskIntrinsicId = NI_EVEX_NotMask;
+                    break;
+                }
+
                 case GT_OR:
                 {
                     maskIntrinsicId = NI_EVEX_OrMask;
@@ -10025,8 +10026,11 @@ GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node)
             node->Op(1) = cvtOp1->Op(1);
             DEBUG_DESTROY_NODE(op1);
 
-            node->Op(2) = cvtOp2->Op(1);
-            DEBUG_DESTROY_NODE(op2);
+            if (oper != GT_NOT)
+            {
+                node->Op(2) = cvtOp2->Op(1);
+                DEBUG_DESTROY_NODE(op2);
+            }
 
             node = gtNewSimdCvtMaskToVectorNode(simdType, node, simdBaseJitType, simdSize)->AsHWIntrinsic();
 
diff --git a/src/coreclr/jit/valuenum.cpp b/src/coreclr/jit/valuenum.cpp
index cb6bd6b1f6bf5..8c8b215cf9298 100644
--- a/src/coreclr/jit/valuenum.cpp
+++ b/src/coreclr/jit/valuenum.cpp
@@ -8573,61 +8573,108 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunTernary(GenTreeHWIntrinsic* tree,
     var_types      baseType = tree->GetSimdBaseType();
     NamedIntrinsic ni       = tree->GetHWIntrinsicId();
 
-    if (IsVNConstant(arg0VN) && IsVNConstant(arg1VN) && IsVNConstant(arg2VN))
+    switch (ni)
     {
-        switch (ni)
-        {
-            case NI_Vector128_WithElement:
-#ifdef TARGET_ARM64
-            case NI_Vector64_WithElement:
-#else
-            case NI_Vector256_WithElement:
-            case NI_Vector512_WithElement:
+        case NI_Vector128_ConditionalSelect:
+#if defined(TARGET_XARCH)
+        case NI_Vector256_ConditionalSelect:
+        case NI_Vector512_ConditionalSelect:
+#elif defined(TARGET_ARM64)
+        case NI_Vector64_ConditionalSelect:
+        case NI_Sve_ConditionalSelect:
 #endif
+        {
+            if (IsVNConstant(arg0VN))
             {
-                int32_t index = GetConstantInt32(arg1VN);
+                // Handle `0 ? x : y`
+                ValueNum zeroVN = VNZeroForType(type);
 
-                if (static_cast<uint32_t>(index) >= GenTreeVecCon::ElementCount(genTypeSize(type), baseType))
+                if (arg0VN == zeroVN)
                 {
-                    // Nothing to fold for out of range indexes
-                    break;
+                    return arg2VN;
                 }
 
-                if (varTypeIsFloating(baseType))
-                {
-                    double value;
+                // Handle `AllBitsSet ? x : y`
+                ValueNum allBitsVN = VNAllBitsForType(type);
 
-                    if (baseType == TYP_FLOAT)
-                    {
-                        value = GetConstantSingle(arg2VN);
-                    }
-                    else
-                    {
-                        value = GetConstantDouble(arg2VN);
-                    }
-                    return EvaluateSimdWithElementFloating(this, type, baseType, arg0VN, index, value);
+                if (arg0VN == allBitsVN)
+                {
+                    return arg1VN;
                 }
-                else
+
+                if (IsVNConstant(arg1VN) && IsVNConstant(arg2VN))
                 {
-                    assert(varTypeIsIntegral(baseType));
-                    int64_t value;
+                    // (y & x) | (z & ~x)
 
-                    if (varTypeIsLong(baseType))
-                    {
-                        value = GetConstantInt64(arg2VN);
-                    }
-                    else
-                    {
-                        value = GetConstantInt32(arg2VN);
-                    }
-                    return EvaluateSimdWithElementIntegral(this, type, baseType, arg0VN, index, value);
+                    ValueNum trueVN  = EvaluateBinarySimd(this, GT_AND, false, type, baseType, arg1VN, arg0VN);
+                    ValueNum falseVN = EvaluateBinarySimd(this, GT_AND_NOT, false, type, baseType, arg2VN, arg0VN);
+
+                    return EvaluateBinarySimd(this, GT_OR, false, type, baseType, trueVN, falseVN);
                 }
             }
+            else if (arg1VN == arg2VN)
+            {
+                // Handle `x ? y : y`
+                return arg1VN;
+            }
+            break;
+        }
 
-            default:
+        case NI_Vector128_WithElement:
+#ifdef TARGET_ARM64
+        case NI_Vector64_WithElement:
+#else
+        case NI_Vector256_WithElement:
+        case NI_Vector512_WithElement:
+#endif
+        {
+            if (!IsVNConstant(arg0VN) || !IsVNConstant(arg1VN) || !IsVNConstant(arg2VN))
+            {
+                break;
+            }
+
+            int32_t index = GetConstantInt32(arg1VN);
+
+            if (static_cast<uint32_t>(index) >= GenTreeVecCon::ElementCount(genTypeSize(type), baseType))
             {
+                // Nothing to fold for out of range indexes
                 break;
             }
+
+            if (varTypeIsFloating(baseType))
+            {
+                double value;
+
+                if (baseType == TYP_FLOAT)
+                {
+                    value = GetConstantSingle(arg2VN);
+                }
+                else
+                {
+                    value = GetConstantDouble(arg2VN);
+                }
+                return EvaluateSimdWithElementFloating(this, type, baseType, arg0VN, index, value);
+            }
+            else
+            {
+                assert(varTypeIsIntegral(baseType));
+                int64_t value;
+
+                if (varTypeIsLong(baseType))
+                {
+                    value = GetConstantInt64(arg2VN);
+                }
+                else
+                {
+                    value = GetConstantInt32(arg2VN);
+                }
+                return EvaluateSimdWithElementIntegral(this, type, baseType, arg0VN, index, value);
+            }
+        }
+
+        default:
+        {
+            break;
         }
     }

From 554e3cc14b4be8fbfdb3e60d21a2dc07574627f7 Mon Sep 17 00:00:00 2001
From: Tanner Gooding
Date: Fri, 28 Jun 2024 13:19:31 -0700
Subject: [PATCH 2/4] Ensure CndSel handles 64-bit operands where possible

---
 src/coreclr/jit/lowerxarch.cpp | 33 ++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index 7975cfe0a7dbf..05dbd9894be7d 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -3078,8 +3078,6 @@ GenTree* Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cm
 //
 GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node)
 {
-    assert(!comp->canUseEvexEncodingDebugOnly());
-
     var_types   simdType        = node->gtType;
     CorInfoType simdBaseJitType = node->GetSimdBaseJitType();
     var_types   simdBaseType    = node->GetSimdBaseType();
@@ -3102,17 +3100,38 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node)
     // we can optimize the entire conditional select to
     // a single BlendVariable instruction (if supported by the architecture)
 
-    // TODO-XARCH-AVX512 Use VPBLENDM* and take input directly from K registers if cond is from MoveMaskToVectorSpecial.
     // First, determine if the condition is a per-element mask
     if (op1->OperIsHWIntrinsic() && HWIntrinsicInfo::ReturnsPerElementMask(op1->AsHWIntrinsic()->GetHWIntrinsicId()))
     {
         // Next, determine if the target architecture supports BlendVariable
         NamedIntrinsic blendVariableId = NI_Illegal;
 
-        // For Vector256 (simdSize == 32), BlendVariable for floats/doubles is available on AVX, whereas other types
-        // require AVX2
-        if (simdSize == 32)
+        bool isOp1CvtMaskToVector = op1->AsHWIntrinsic()->OperIsConvertMaskToVector();
+
+        if ((simdSize == 64) || isOp1CvtMaskToVector)
         {
+            GenTree* maskNode;
+
+            if (isOp1CvtMaskToVector)
+            {
+                maskNode = op1->AsHWIntrinsic()->Op(1);
+                BlockRange().Remove(op1);
+            }
+            else
+            {
+                maskNode = comp->gtNewSimdCvtVectorToMaskNode(TYP_MASK, op1, simdBaseJitType, simdSize);
+                BlockRange().InsertBefore(node, maskNode);
+            }
+
+            assert(maskNode->TypeGet() == TYP_MASK);
+            blendVariableId = NI_EVEX_BlendVariableMask;
+            op1             = maskNode;
+        }
+        else if (simdSize == 32)
+        {
+            // For Vector256 (simdSize == 32), BlendVariable for floats/doubles
+            // is available on AVX, whereas other types (integrals) require AVX2
+
             if (varTypeIsFloating(simdBaseType))
             {
                 // This should have already been confirmed
@@ -3124,9 +3143,9 @@ GenTree* Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node)
                 blendVariableId = NI_AVX2_BlendVariable;
             }
         }
-        // For Vector128, BlendVariable is available on SSE41
         else if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41))
         {
+            // For Vector128, BlendVariable is available on SSE41
             blendVariableId = NI_SSE41_BlendVariable;
         }
 

From ca61703ce4c446c416120a0f61632af6031f8837 Mon Sep 17 00:00:00 2001
From: Tanner Gooding
Date: Fri, 28 Jun 2024 17:42:15 -0700
Subject: [PATCH 3/4] Don't fold if op3 has side effects

---
 src/coreclr/jit/gentree.cpp | 21 +++++++--------------
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 7e69c011f45b7..7b54c004dc33a 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -30506,23 +30506,16 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
 
                 if (op1->IsVectorAllBitsSet() || op1->IsMaskAllBitsSet())
                 {
-                    if ((op3->gtFlags & GTF_SIDE_EFFECT) == 0)
+                    if ((op3->gtFlags & GTF_SIDE_EFFECT) != 0)
                     {
-                        // op3 has no side effects, so we can return op2 directly
-                        return op2;
+                        // op3 has side effects, this would require us to append a new statement
+                        // to ensure that it isn't lost, which isn't safe to do from the general
+                        // purpose handler here. We'll recognize this and mark it in VN instead
+                        break;
                     }
 
-                    if ((op2->gtFlags & GTF_SIDE_EFFECT) == 0)
-                    {
-                        // op2 has no side effects, but op3 does, we need to wrap
-                        // so that the side effects for op3 are preserved
-                        return gtWrapWithSideEffects(op2, op3, GTF_ALL_EFFECT);
-                    }
-
-                    // op2 and op3 both have side effects, we would have to evaluate op2
-                    // into a local and then wrap the op3 with side effects returning that
-                    // local, which isn't safe to do from the general purpose handler here
-                    break;
+                    // op3 has no side effects, so we can return op2 directly
+                    return op2;
                 }
 
                 if (op1->IsVectorZero())

From d400cf378dea5541c044f87d28021b30cbb3c92b Mon Sep 17 00:00:00 2001
From: Tanner Gooding
Date: Fri, 28 Jun 2024 18:45:29 -0700
Subject: [PATCH 4/4] Ensure that operands are passed into the lowered
 not->ternarylogic correctly

---
 src/coreclr/jit/lowerxarch.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index 05dbd9894be7d..45270cf20b065 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -1526,7 +1526,7 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
                 GenTree* control = comp->gtNewIconNode(static_cast<uint8_t>(~0xAA)); // ~C
                 BlockRange().InsertBefore(node, control);
 
-                node->ResetHWIntrinsicId(ternaryLogicId, comp, op1, op2, op3, control);
+                node->ResetHWIntrinsicId(ternaryLogicId, comp, op3, op2, op1, control);
             }
             else
             {
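
Note on the control bytes (an illustrative aside, not part of the patches): the constants 0xCA and ~0xAA used above follow the vpternlog truth-table encoding, in which the three TernaryLogic operands contribute the fixed masks A = 0xF0, B = 0xCC, and C = 0xAA, and a control byte is simply the desired boolean expression evaluated over those masks. A self-contained C++ sketch of the arithmetic:

#include <cstdio>

int main()
{
    // bit i of a vpternlog control byte is the desired result when the
    // operand bits (A, B, C) equal ((i >> 2) & 1, (i >> 1) & 1, i & 1)
    constexpr unsigned A = 0xF0; // first  TernaryLogic operand
    constexpr unsigned B = 0xCC; // second TernaryLogic operand
    constexpr unsigned C = 0xAA; // third  TernaryLogic operand

    // ConditionalSelect(mask, x, y) == (x & mask) | (y & ~mask); with
    // A = mask, B = x, C = y the control byte is (B & A) | (C & ~A)
    unsigned cndsel = ((B & A) | (C & ~A)) & 0xFF;
    std::printf("CndSel control: 0x%02X\n", cndsel); // prints 0xCA

    // NOT references only the third operand, so its control byte is ~C
    unsigned notCtl = ~C & 0xFF;
    std::printf("NOT control:    0x%02X\n", notCtl); // prints 0x55
    return 0;
}

Because ~0xAA describes a function of the C (third) operand only, the value being complemented must be passed as the third operand of TernaryLogic; that is exactly the (op1, op2, op3) to (op3, op2, op1) operand-order fix in patch 4/4.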
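Note on the fold semantics (also illustrative, not part of the patches): the constant fold added to gtFoldExprHWIntrinsic and EvalHWIntrinsicFunTernary evaluates ConditionalSelect element-wise as (op2 & op1) | (op3 & ~op1) via GT_AND, GT_AND_NOT, and GT_OR. A minimal scalar sketch of that per-element semantics, using a hypothetical helper name and a single 64-bit lane:

#include <cstdint>

// Scalar analogue of the GT_AND / GT_AND_NOT / GT_OR sequence used by the fold
std::uint64_t FoldCndSelElement(std::uint64_t mask, std::uint64_t whenTrue, std::uint64_t whenFalse)
{
    std::uint64_t t = whenTrue & mask;   // op2 = op2 & op1
    std::uint64_t f = whenFalse & ~mask; // op3 = op3 & ~op1
    return t | f;                        // op2 = op2 | op3
}

With an all-bits-set mask this reduces to whenTrue (the return-op2 fold, which patch 3/4 restricts to the case where op3 has no side effects), and with a zero mask it reduces to whenFalse.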