From 3aa1ec5bb1f50f0a1bed9cfcac8734f742bcf24b Mon Sep 17 00:00:00 2001
From: Tanner Gooding
Date: Fri, 13 Dec 2024 18:49:19 -0800
Subject: [PATCH] Improve codegen for Vector512.ExtractMostSignificantBits (#110662)

---
 src/coreclr/jit/decomposelongs.cpp   | 105 +++++++++++++++++++++++++++
 src/coreclr/jit/decomposelongs.h     |   1 +
 src/coreclr/jit/hwintrinsicxarch.cpp |  17 ++---
 3 files changed, 111 insertions(+), 12 deletions(-)

diff --git a/src/coreclr/jit/decomposelongs.cpp b/src/coreclr/jit/decomposelongs.cpp
index 84802400feeb0..9280f459f978b 100644
--- a/src/coreclr/jit/decomposelongs.cpp
+++ b/src/coreclr/jit/decomposelongs.cpp
@@ -1707,6 +1707,11 @@ GenTree* DecomposeLongs::DecomposeHWIntrinsic(LIR::Use& use)
             return DecomposeHWIntrinsicGetElement(use, hwintrinsicTree);
         }
 
+        case NI_EVEX_MoveMask:
+        {
+            return DecomposeHWIntrinsicMoveMask(use, hwintrinsicTree);
+        }
+
         default:
         {
             noway_assert(!"unexpected GT_HWINTRINSIC node in long decomposition");
@@ -1830,6 +1835,106 @@ GenTree* DecomposeLongs::DecomposeHWIntrinsicGetElement(LIR::Use& use, GenTreeHW
     return FinalizeDecomposition(use, loResult, hiResult, hiResult);
 }
 
+//------------------------------------------------------------------------
+// DecomposeHWIntrinsicMoveMask: Decompose GT_HWINTRINSIC -- NI_EVEX_MoveMask
+//
+// Decompose a MoveMask(x) node on Vector512<*>. For:
+//
+//     GT_HWINTRINSIC{MoveMask}[*](mask_var)
+//
+// create, when the element type is byte/sbyte:
+//
+//      tmp_mask_var = mask_var
+//      lo_result    = GT_HWINTRINSIC{MoveMask}(tmp_mask_var)
+//      tmp_mask_hi  = GT_HWINTRINSIC{ShiftRightMask}(tmp_mask_var, 32)
+//      hi_result    = GT_HWINTRINSIC{MoveMask}(tmp_mask_hi)
+//      return: GT_LONG(lo_result, hi_result)
+//
+// For every other element type lo_result is MoveMask applied to mask_var
+// directly and hi_result is a zero constant; no temporary is introduced.
+// The operand is asserted to be TYP_MASK and the SIMD size to be 64.
+//
+// Arguments:
+//    use - the LIR::Use object for the def that needs to be decomposed.
+//    node - the hwintrinsic node to decompose
+//
+// Return Value:
+//    The next node to process.
+//
+GenTree* DecomposeLongs::DecomposeHWIntrinsicMoveMask(LIR::Use& use, GenTreeHWIntrinsic* node)
+{
+    assert(node == use.Def());
+    assert(varTypeIsLong(node));
+    assert(node->GetHWIntrinsicId() == NI_EVEX_MoveMask);
+
+    GenTree*    op1             = node->Op(1);
+    CorInfoType simdBaseJitType = node->GetSimdBaseJitType();
+    var_types   simdBaseType    = node->GetSimdBaseType();
+    unsigned    simdSize        = node->GetSimdSize();
+
+    assert(varTypeIsArithmetic(simdBaseType));
+    assert(op1->TypeGet() == TYP_MASK);
+    assert(simdSize == 64);
+
+    GenTree* loResult = nullptr;
+    GenTree* hiResult = nullptr;
+
+    if (varTypeIsByte(simdBaseType))
+    {
+        // Create:
+        //      simdTmpVar = op1
+
+        GenTree* simdTmpVar    = RepresentOpAsLocalVar(op1, node, &node->Op(1));
+        unsigned simdTmpVarNum = simdTmpVar->AsLclVarCommon()->GetLclNum();
+        JITDUMP("[DecomposeHWIntrinsicMoveMask]: Saving op1 tree to a temp var:\n");
+        DISPTREERANGE(Range(), simdTmpVar);
+        Range().Remove(simdTmpVar);
+
+        Range().InsertBefore(node, simdTmpVar);
+
+        // Create:
+        //      loResult = GT_HWINTRINSIC{MoveMask}(simdTmpVar)
+
+        loResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_INT, simdTmpVar, NI_EVEX_MoveMask, simdBaseJitType, 32);
+        Range().InsertBefore(node, loResult);
+
+        simdTmpVar = m_compiler->gtNewLclLNode(simdTmpVarNum, simdTmpVar->TypeGet());
+        Range().InsertBefore(node, simdTmpVar);
+
+        // Create:
+        //      simdTmpVar = GT_HWINTRINSIC{ShiftRightMask}(simdTmpVar, 32)
+        //      hiResult   = GT_HWINTRINSIC{MoveMask}(simdTmpVar)
+
+        GenTree* shiftIcon = m_compiler->gtNewIconNode(32, TYP_INT);
+        Range().InsertBefore(node, shiftIcon);
+
+        simdTmpVar = m_compiler->gtNewSimdHWIntrinsicNode(TYP_MASK, simdTmpVar, shiftIcon, NI_EVEX_ShiftRightMask,
+                                                          simdBaseJitType, 64);
+        Range().InsertBefore(node, simdTmpVar);
+
+        hiResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_INT, simdTmpVar, NI_EVEX_MoveMask, simdBaseJitType, 32);
+        Range().InsertBefore(node, hiResult);
+    }
+    else
+    {
+        // Create:
+        //      loResult = GT_HWINTRINSIC{MoveMask}(op1)
+
+        loResult = m_compiler->gtNewSimdHWIntrinsicNode(TYP_INT, op1, NI_EVEX_MoveMask, simdBaseJitType, simdSize);
+        Range().InsertBefore(node, loResult);
+
+        // Create:
+        //      hiResult = GT_ICON(0)
+
+        hiResult = m_compiler->gtNewZeroConNode(TYP_INT);
+        Range().InsertBefore(node, hiResult);
+    }
+
+    // Done with the original tree; remove it.
+    Range().Remove(node);
+
+    return FinalizeDecomposition(use, loResult, hiResult, hiResult);
+}
 #endif // FEATURE_HW_INTRINSICS
 
 //------------------------------------------------------------------------
diff --git a/src/coreclr/jit/decomposelongs.h b/src/coreclr/jit/decomposelongs.h
index 744061091e42b..02681322a552e 100644
--- a/src/coreclr/jit/decomposelongs.h
+++ b/src/coreclr/jit/decomposelongs.h
@@ -64,6 +64,7 @@ class DecomposeLongs
 #ifdef FEATURE_HW_INTRINSICS
     GenTree* DecomposeHWIntrinsic(LIR::Use& use);
     GenTree* DecomposeHWIntrinsicGetElement(LIR::Use& use, GenTreeHWIntrinsic* node);
+    GenTree* DecomposeHWIntrinsicMoveMask(LIR::Use& use, GenTreeHWIntrinsic* node);
 #endif // FEATURE_HW_INTRINSICS
 
     GenTree* OptimizeCastFromDecomposedLong(GenTreeCast* cast, GenTree* nextNode);
diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp
index d4651444bf5e7..7ba8c6615f361 100644
--- a/src/coreclr/jit/hwintrinsicxarch.cpp
+++ b/src/coreclr/jit/hwintrinsicxarch.cpp
@@ -2511,14 +2511,13 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
             break;
         }
 
+        case NI_Vector128_ExtractMostSignificantBits:
+        case NI_Vector256_ExtractMostSignificantBits:
         case NI_Vector512_ExtractMostSignificantBits:
         {
-#if defined(TARGET_X86)
-            // TODO-XARCH-CQ: It may be beneficial to decompose this operation
-            break;
-#endif // TARGET_X86
+            assert(sig->numArgs == 1);
 
-            if (IsBaselineVector512IsaSupportedOpportunistically())
+            if ((simdSize == 64) || (varTypeIsShort(simdBaseType) && canUseEvexEncoding()))
             {
                 op1 = impSIMDPopStack();
 
@@ -2527,14 +2526,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
                     op1 = gtNewSimdCvtVectorToMaskNode(TYP_MASK, op1, simdBaseJitType, simdSize);
                 }
                 retNode = gtNewSimdHWIntrinsicNode(retType, op1, NI_EVEX_MoveMask, simdBaseJitType, simdSize);
+                break;
             }
-            break;
-        }
-
-        case NI_Vector128_ExtractMostSignificantBits:
-        case NI_Vector256_ExtractMostSignificantBits:
-        {
-            assert(sig->numArgs == 1);
 
             if ((simdSize != 32) || varTypeIsFloating(simdBaseType) ||
                 compOpportunisticallyDependsOn(InstructionSet_AVX2))