Minimally allow Vector.Shuffle to expand for values that become constant by global morph #102676

Closed · wants to merge 1 commit
src/coreclr/jit/compiler.h: 11 additions & 2 deletions
@@ -3354,10 +3354,13 @@ class Compiler
NamedIntrinsic hwIntrinsicID);
GenTreeHWIntrinsic* gtNewScalarHWIntrinsicNode(
var_types type, GenTree* op1, GenTree* op2, GenTree* op3, NamedIntrinsic hwIntrinsicID);
CorInfoType getBaseJitTypeFromArgIfNeeded(NamedIntrinsic intrinsic,
var_types getRetTypeAndBaseJitTypeFromSig(NamedIntrinsic intrinsic,
CORINFO_CLASS_HANDLE clsHnd,
CORINFO_SIG_INFO* sig,
CorInfoType simdBaseJitType);
CorInfoType* simdBaseJitType);
CorInfoType getBaseJitTypeFromArgIfNeeded(NamedIntrinsic intrinsic,
CORINFO_SIG_INFO* sig,
CorInfoType simdBaseJitType);

#ifdef TARGET_ARM64
GenTreeFieldList* gtConvertTableOpToFieldList(GenTree* op, unsigned fieldCount);
@@ -3615,6 +3618,9 @@ class Compiler
GenTree* gtFoldExprCall(GenTreeCall* call);
GenTree* gtFoldTypeCompare(GenTree* tree);
GenTree* gtFoldTypeEqualityCall(bool isEq, GenTree* op1, GenTree* op2);
#if defined(FEATURE_HW_INTRINSICS)
GenTree* gtFoldHWIntrinsicCall(GenTreeCall* call, NamedIntrinsic intrinsic);
#endif // FEATURE_HW_INTRINSICS

// Options to control behavior of gtTryRemoveBoxUpstreamEffects
enum BoxRemovalOptions
@@ -4573,6 +4579,9 @@ class Compiler
bool mustExpand);

#ifdef FEATURE_HW_INTRINSICS
static bool isSupportedBaseType(NamedIntrinsic intrinsic, CorInfoType baseJitType);
bool IsValidForShuffle(GenTreeVecCon* vecCon, unsigned simdSize, var_types simdBaseType) const;

GenTree* impHWIntrinsic(NamedIntrinsic intrinsic,
CORINFO_CLASS_HANDLE clsHnd,
CORINFO_METHOD_HANDLE method,
src/coreclr/jit/gentree.cpp: 177 additions & 0 deletions
@@ -13684,6 +13684,13 @@ GenTree* Compiler::gtFoldExprCall(GenTreeCall* call)
// Check for a new-style jit intrinsic.
const NamedIntrinsic ni = lookupNamedIntrinsic(call->gtCallMethHnd);

#if defined(FEATURE_HW_INTRINSICS)
if ((ni > NI_HW_INTRINSIC_START) && (ni < NI_SIMD_AS_HWINTRINSIC_END))
{
return gtFoldHWIntrinsicCall(call, ni);
}
#endif // FEATURE_HW_INTRINSICS

switch (ni)
{
case NI_System_Enum_HasFlag:
@@ -13756,6 +13763,103 @@ GenTree* Compiler::gtFoldTypeEqualityCall(bool isEq, GenTree* op1, GenTree* op2)
return compare;
}

#if defined(FEATURE_HW_INTRINSICS)
//------------------------------------------------------------------------
// gtFoldHWIntrinsicCall: Fold a call to a hardware intrinsic API or return the original call
//
// Arguments:
// call -- the call node to attempt to fold
// intrinsic -- the ID of the intrinsic represented by the call
//
// Returns:
// call if no folding happened.
// An alternative tree if folding happens.
//
GenTree* Compiler::gtFoldHWIntrinsicCall(GenTreeCall* call, NamedIntrinsic intrinsic)
{
assert((intrinsic > NI_HW_INTRINSIC_START) && (intrinsic < NI_SIMD_AS_HWINTRINSIC_END));

if (intrinsic > NI_SIMD_AS_HWINTRINSIC_START)
{
// TODO-CQ: Handle SIMD_AS_HWINTRINSIC
return call;
}

CORINFO_CLASS_HANDLE clsHnd = NO_CLASS_HANDLE;
CORINFO_METHOD_HANDLE method = call->gtCallMethHnd;

CORINFO_SIG_INFO sig;
eeGetMethodSig(method, &sig);

int numArgs = sig.numArgs;
CorInfoType simdBaseJitType = CORINFO_TYPE_UNDEF;
var_types retType = getRetTypeAndBaseJitTypeFromSig(intrinsic, clsHnd, &sig, &simdBaseJitType);
GenTree* retNode = call;

if (retType == TYP_UNKNOWN)
{
return retNode;
}

HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(intrinsic);
CORINFO_InstructionSet isa = HWIntrinsicInfo::lookupIsa(intrinsic);

// Immediately return if the category is other than scalar/special and this is not a supported base type.
if ((category != HW_Category_Special) && (category != HW_Category_Scalar) && !HWIntrinsicInfo::isScalarIsa(isa) &&
!isSupportedBaseType(intrinsic, simdBaseJitType))
{
return retNode;
}

var_types simdBaseType = TYP_UNKNOWN;

if (simdBaseJitType != CORINFO_TYPE_UNDEF)
{
simdBaseType = JitType2PreciseVarType(simdBaseJitType);
assert(varTypeIsArithmetic(simdBaseType));
}

const unsigned simdSize = HWIntrinsicInfo::lookupSimdSize(this, intrinsic, &sig);

switch (intrinsic)
{
case NI_Vector128_Shuffle:
#if defined(TARGET_XARCH)
case NI_Vector256_Shuffle:
case NI_Vector512_Shuffle:
#elif defined(TARGET_ARM64)
case NI_Vector64_Shuffle:
#endif
{
GenTree* op2 = call->gtArgs.GetUserArgByIndex(1)->GetNode();

if (!op2->IsVectorConst() || !IsValidForShuffle(op2->AsVecCon(), simdSize, simdBaseType))
{
// TODO-CQ: Handling non-constant indices is a bit more complex
break;
}

GenTree* op1 = call->gtArgs.GetUserArgByIndex(0)->GetNode();
retNode = gtNewSimdShuffleNode(retType, op1, op2, simdBaseJitType, simdSize);

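// If the call returns its result through a hidden return buffer, preserve that
// contract by storing the folded value through the buffer.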
if (call->gtArgs.HasRetBuffer())
{
GenTree* retBuf = call->gtArgs.GetRetBufferArg()->GetNode();
retNode = gtNewStoreIndNode(retType, retBuf, retNode);
}
break;
}

default:
{
break;
}
}

return retNode;
}
#endif // FEATURE_HW_INTRINSICS
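
// For orientation, the following is an illustrative standalone sketch (assumed enum entries and
// helper names, not the real NamedIntrinsic definition) of the range layout that the dispatch in
// gtFoldExprCall and the early bail-out above rely on: both ISA hardware intrinsics and the
// SIMD-as-HWIntrinsic helpers fall strictly between NI_HW_INTRINSIC_START and
// NI_SIMD_AS_HWINTRINSIC_END, and anything above NI_SIMD_AS_HWINTRINSIC_START is the subset that
// is currently returned unchanged.

#include <cstdio>

enum NamedIntrinsicSketch
{
    NI_Illegal = 0,
    NI_HW_INTRINSIC_START,
    NI_Vector128_Shuffle,        // ... ISA hardware intrinsics live in this range
    NI_HW_INTRINSIC_END,
    NI_SIMD_AS_HWINTRINSIC_START,
    NI_SimdAsHWIntrinsic_Sample, // ... Vector2/3/4 and Vector<T> helpers live here
    NI_SIMD_AS_HWINTRINSIC_END,
};

// Mirrors the check in gtFoldExprCall that routes a call to gtFoldHWIntrinsicCall.
static bool reachesHWIntrinsicFold(NamedIntrinsicSketch ni)
{
    return (ni > NI_HW_INTRINSIC_START) && (ni < NI_SIMD_AS_HWINTRINSIC_END);
}

// Mirrors the early bail-out in gtFoldHWIntrinsicCall (TODO-CQ: not folded yet).
static bool isSimdAsHWIntrinsic(NamedIntrinsicSketch ni)
{
    return ni > NI_SIMD_AS_HWINTRINSIC_START;
}

int main()
{
    printf("%d %d\n", reachesHWIntrinsicFold(NI_Vector128_Shuffle), isSimdAsHWIntrinsic(NI_Vector128_Shuffle));               // 1 0
    printf("%d %d\n", reachesHWIntrinsicFold(NI_SimdAsHWIntrinsic_Sample), isSimdAsHWIntrinsic(NI_SimdAsHWIntrinsic_Sample)); // 1 1
    return 0;
}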

/*****************************************************************************
*
* Some comparisons can be folded:
@@ -18269,6 +18373,79 @@ unsigned GenTreeVecCon::ElementCount(unsigned simdSize, var_types simdBaseType)
{
return simdSize / genTypeSize(simdBaseType);
}

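//------------------------------------------------------------------------
// IsValidForShuffle: Check whether a constant shuffle index vector can be
//    handled by gtNewSimdShuffleNode for the given SIMD size and base type
//    on the compilation target.
//
// Arguments:
//    vecCon       -- the vector constant supplying the shuffle indices
//    simdSize     -- the size, in bytes, of the SIMD type being shuffled
//    simdBaseType -- the base (element) type of the SIMD type
//
// Returns:
//    true if the shuffle can be expanded for these indices; otherwise false.
//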
bool Compiler::IsValidForShuffle(GenTreeVecCon* vecCon, unsigned simdSize, var_types simdBaseType) const
{
#if defined(TARGET_XARCH)
size_t elementSize = genTypeSize(simdBaseType);
size_t elementCount = simdSize / elementSize;

if (simdSize == 32)
{
if (!compOpportunisticallyDependsOn(InstructionSet_AVX2))
{
// While we could accelerate some functions on hardware with only AVX support
// it's likely not worth it overall given that IsHardwareAccelerated reports false
return false;
}
else if ((varTypeIsByte(simdBaseType) && !compOpportunisticallyDependsOn(InstructionSet_AVX512VBMI_VL)) ||
(varTypeIsShort(simdBaseType) && !compOpportunisticallyDependsOn(InstructionSet_AVX512BW_VL)))
{
bool crossLane = false;

for (size_t index = 0; index < elementCount; index++)
{
uint64_t value = vecCon->GetIntegralVectorConstElement(index, simdBaseType);

if (value >= elementCount)
{
continue;
}

if (index < (elementCount / 2))
{
if (value >= (elementCount / 2))
{
crossLane = true;
break;
}
}
else if (value < (elementCount / 2))
{
crossLane = true;
break;
}
}

if (crossLane)
{
// TODO-XARCH-CQ: We should emulate cross-lane shuffling for byte/sbyte and short/ushort
return false;
}
}
}
else if (simdSize == 64)
{
if (varTypeIsByte(simdBaseType) && !compOpportunisticallyDependsOn(InstructionSet_AVX512VBMI))
{
// TYP_BYTE, TYP_UBYTE need AVX512VBMI.
return false;
}
}
else
{
assert(simdSize == 16);

if (varTypeIsSmall(simdBaseType) && !compOpportunisticallyDependsOn(InstructionSet_SSSE3))
{
// TYP_BYTE, TYP_UBYTE, TYP_SHORT, and TYP_USHORT need SSSE3 to accelerate any shuffle
return false;
}
}
#endif // TARGET_XARCH

return true;
}
#endif // FEATURE_HW_INTRINSICS
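
// As a rough, standalone illustration (not JIT code; the helper and driver below are made up
// for this sketch) of the cross-lane test used in the 32-byte path above: an index is only a
// problem when it is in range and selects an element from the other 128-bit half of the vector,
// since out-of-range indices simply produce zero. The same index arithmetic applies to any
// element width, although the JIT only needs the check for byte/short base types without
// AVX512VBMI_VL/AVX512BW_VL.

#include <cstdint>
#include <cstdio>
#include <vector>

// Returns true when any in-range index moves an element across the 128-bit lane
// boundary of a two-lane vector (e.g. a 32-byte Vector256).
static bool ShuffleCrossesLanes(const std::vector<uint64_t>& indices)
{
    const size_t elementCount = indices.size();
    const size_t halfCount    = elementCount / 2;

    for (size_t index = 0; index < elementCount; index++)
    {
        const uint64_t value = indices[index];

        if (value >= elementCount)
        {
            continue; // out-of-range index: the element becomes zero, no lane crossing
        }

        const bool dstInLowLane = (index < halfCount);
        const bool srcInLowLane = (value < halfCount);

        if (dstInLowLane != srcInLowLane)
        {
            return true;
        }
    }

    return false;
}

int main()
{
    // Vector256<short>: 16 elements, two 128-bit lanes of 8 elements each.
    std::vector<uint64_t> reverseWithinLanes = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};
    std::vector<uint64_t> fullReverse        = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};

    printf("%d\n", ShuffleCrossesLanes(reverseWithinLanes)); // prints 0
    printf("%d\n", ShuffleCrossesLanes(fullReverse));        // prints 1
    return 0;
}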

//------------------------------------------------------------------------