Allow shuffle and other hwintrinsics that require a constant to stay intrinsic if the operand becomes constant later (#102827)

* Allow shuffle and other hwintrinsics that require a constant to stay intrinsic if the operand becomes constant later

* Apply formatting patch

* Ensure GT_HWINTRINSIC clone copies the entry point

* Ensure we don't overwrite the entry point for a hwintrinsic that already had it set

* Remove a bad assert check for GTF_IND_NONFAULTING on GT_HWINTRINSIC nodes

* Ensure Arm64 uses ||

* Fix an assert for Arm64

* Make sure sigInfo is passed down

* Ensure the call type is corrected where applicable

* Don't allow intrinsics that need GenTreeFieldList to be rewritten as user calls
tannergooding authored and pull[bot] committed Jul 20, 2024
1 parent cbd4ae6 commit 964383a
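
For context: the headline change targets code where a shuffle's index vector is not a constant at the call site itself but becomes one once inlining and constant propagation have run. A loose C++ analogue of that shape (illustrative only; it uses x86 intrinsics directly, whereas the commit concerns the managed Vector128.Shuffle and friends):

#include <immintrin.h> // compile with SSSE3 enabled, e.g. -mssse3

// In isolation 'indices' is a parameter, so it is not a constant here...
static inline __m128i ShuffleBytes(__m128i v, __m128i indices)
{
    return _mm_shuffle_epi8(v, indices);
}

// ...but after inlining this call site the index vector is a compile-time
// constant, which is the situation the JIT now waits for instead of eagerly
// rewriting the intrinsic into a user call.
__m128i ReverseBytes(__m128i v)
{
    const __m128i indices = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8,
                                          7, 6, 5, 4, 3, 2, 1, 0);
    return ShuffleBytes(v, indices);
}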
Showing 9 changed files with 463 additions and 177 deletions.
57 changes: 24 additions & 33 deletions src/coreclr/jit/assertionprop.cpp
@@ -3128,6 +3128,10 @@ bool Compiler::optIsProfitableToSubstitute(GenTree* dest, BasicBlock* destBlock,
         return false;
     }

+    // For several of the scenarios below we may skip the costing logic
+    // since we know that the operand is always containable and therefore
+    // is always cost effective to propagate.
+
     switch (intrinsicId)
     {
 #if defined(TARGET_ARM64)
@@ -3144,13 +3148,8 @@ bool Compiler::optIsProfitableToSubstitute(GenTree* dest, BasicBlock* destBlock,
 #endif // TARGET_XARCH
         {
             // We can optimize when the constant is zero, but only
-            // for non floating-point since +0.0 == -0.0
-
-            if (!vecCon->IsZero() || varTypeIsFloating(simdBaseType))
-            {
-                return false;
-            }
-            break;
+            // for non floating-point since +0.0 == -0.0.
+            return vecCon->IsZero() && !varTypeIsFloating(simdBaseType);
         }

 #if defined(TARGET_ARM64)
@@ -3160,12 +3159,7 @@ bool Compiler::optIsProfitableToSubstitute(GenTree* dest, BasicBlock* destBlock,
         {
             // We can optimize when the constant is zero due to a
             // specialized encoding for the instruction
-
-            if (!vecCon->IsZero())
-            {
-                return false;
-            }
-            break;
+            return vecCon->IsZero();
         }

         case NI_AdvSimd_CompareGreaterThan:
@@ -3178,44 +3172,41 @@ bool Compiler::optIsProfitableToSubstitute(GenTree* dest, BasicBlock* destBlock,
             // We can optimize when the constant is zero, but only
             // for signed types, due to a specialized encoding for
             // the instruction
-
-            if (!vecCon->IsZero() || varTypeIsUnsigned(simdBaseType))
-            {
-                return false;
-            }
-            break;
+            return vecCon->IsZero() && !varTypeIsUnsigned(simdBaseType);
         }
 #endif // TARGET_ARM64

 #if defined(TARGET_XARCH)
         case NI_SSE2_Insert:
         case NI_SSE41_Insert:
         case NI_SSE41_X64_Insert:
         {
             // We can optimize for float when the constant is zero
             // due to a specialized encoding for the instruction
-
-            if ((simdBaseType != TYP_FLOAT) || !vecCon->IsZero())
-            {
-                return false;
-            }
-            break;
+            return (simdBaseType == TYP_FLOAT) && vecCon->IsZero();
         }

         case NI_AVX512F_CompareEqualMask:
         case NI_AVX512F_CompareNotEqualMask:
         {
             // We can optimize when the constant is zero, but only
             // for non floating-point since +0.0 == -0.0
-
-            if (!vecCon->IsZero() || varTypeIsFloating(simdBaseType))
-            {
-                return false;
-            }
-            break;
+            return vecCon->IsZero() && !varTypeIsFloating(simdBaseType);
         }
 #endif // TARGET_XARCH

+        case NI_Vector128_Shuffle:
+#if defined(TARGET_XARCH)
+        case NI_Vector256_Shuffle:
+        case NI_Vector512_Shuffle:
+#elif defined(TARGET_ARM64)
+        case NI_Vector64_Shuffle:
+#endif
+        {
+            // The shuffle indices need to be constant so we can preserve
+            // the node as a hwintrinsic instead of rewriting as a user call.
+            assert(parent->GetOperandCount() == 2);
+            return parent->IsUserCall() && (dest == parent->Op(2));
+        }
+
         default:
         {
             break;
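
An aside on the recurring "+0.0 == -0.0" comments above: an all-bits-zero vector constant holds +0.0 in every floating-point lane, yet IEEE 754 makes -0.0 compare equal to it even though the bit patterns differ, which is why those cases decline the zero-constant optimization for floating-point base types. A quick standalone check of that fact (illustrative only, not part of the change):

#include <cstdint>
#include <cstdio>
#include <cstring>

int main()
{
    double pz = +0.0;
    double nz = -0.0;

    uint64_t pzBits;
    uint64_t nzBits;
    memcpy(&pzBits, &pz, sizeof(pz)); // type-pun safely via memcpy
    memcpy(&nzBits, &nz, sizeof(nz));

    printf("pz == nz: %d\n", pz == nz ? 1 : 0); // prints 1: the zeros compare equal
    printf("bits: %016llx vs %016llx\n",        // 0000000000000000 vs 8000000000000000
           (unsigned long long)pzBits, (unsigned long long)nzBits);
    return 0;
}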
2 changes: 2 additions & 0 deletions src/coreclr/jit/compiler.h
@@ -4600,6 +4600,8 @@ class Compiler
                                     bool                  mustExpand);

 #ifdef FEATURE_HW_INTRINSICS
+    bool IsValidForShuffle(GenTreeVecCon* vecCon, unsigned simdSize, var_types simdBaseType) const;
+
     GenTree* impHWIntrinsic(NamedIntrinsic        intrinsic,
                             CORINFO_CLASS_HANDLE  clsHnd,
                             CORINFO_METHOD_HANDLE method,
77 changes: 74 additions & 3 deletions src/coreclr/jit/gentree.cpp
@@ -7236,8 +7236,6 @@ ExceptionSetFlags GenTree::OperExceptions(Compiler* comp)

     if (hwIntrinsicNode->OperIsMemoryLoadOrStore())
     {
-        assert((gtFlags & GTF_IND_NONFAULTING) == 0);
-
         // TODO-CQ: We should use comp->fgAddrCouldBeNull on the address operand
         // to determine if this can actually produce an NRE or not

@@ -18282,6 +18280,79 @@ unsigned GenTreeVecCon::ElementCount(unsigned simdSize, var_types simdBaseType)
 {
     return simdSize / genTypeSize(simdBaseType);
 }
+
+bool Compiler::IsValidForShuffle(GenTreeVecCon* vecCon, unsigned simdSize, var_types simdBaseType) const
+{
+#if defined(TARGET_XARCH)
+    size_t elementSize  = genTypeSize(simdBaseType);
+    size_t elementCount = simdSize / elementSize;
+
+    if (simdSize == 32)
+    {
+        if (!compOpportunisticallyDependsOn(InstructionSet_AVX2))
+        {
+            // While we could accelerate some functions on hardware with only AVX support
+            // it's likely not worth it overall given that IsHardwareAccelerated reports false
+            return false;
+        }
+        else if ((varTypeIsByte(simdBaseType) && !compOpportunisticallyDependsOn(InstructionSet_AVX512VBMI_VL)) ||
+                 (varTypeIsShort(simdBaseType) && !compOpportunisticallyDependsOn(InstructionSet_AVX512BW_VL)))
+        {
+            bool crossLane = false;
+
+            for (size_t index = 0; index < elementCount; index++)
+            {
+                uint64_t value = vecCon->GetIntegralVectorConstElement(index, simdBaseType);
+
+                if (value >= elementCount)
+                {
+                    continue;
+                }
+
+                if (index < (elementCount / 2))
+                {
+                    if (value >= (elementCount / 2))
+                    {
+                        crossLane = true;
+                        break;
+                    }
+                }
+                else if (value < (elementCount / 2))
+                {
+                    crossLane = true;
+                    break;
+                }
+            }
+
+            if (crossLane)
+            {
+                // TODO-XARCH-CQ: We should emulate cross-lane shuffling for byte/sbyte and short/ushort
+                return false;
+            }
+        }
+    }
+    else if (simdSize == 64)
+    {
+        if (varTypeIsByte(simdBaseType) && !compOpportunisticallyDependsOn(InstructionSet_AVX512VBMI))
+        {
+            // TYP_BYTE, TYP_UBYTE need AVX512VBMI.
+            return false;
+        }
+    }
+    else
+    {
+        assert(simdSize == 16);
+
+        if (varTypeIsSmall(simdBaseType) && !compOpportunisticallyDependsOn(InstructionSet_SSSE3))
+        {
+            // TYP_BYTE, TYP_UBYTE, TYP_SHORT, and TYP_USHORT need SSSE3 to be able to shuffle any operation
+            return false;
+        }
+    }
+#endif // TARGET_XARCH
+
+    return true;
+}
 #endif // FEATURE_HW_INTRINSICS

 //------------------------------------------------------------------------
@@ -26417,7 +26488,7 @@ GenTreeFieldList* Compiler::gtConvertTableOpToFieldList(GenTree* op, unsigned fieldCount)
     LclVarDsc* opVarDsc  = lvaGetDesc(op->AsLclVar());
     unsigned   lclNum    = lvaGetLclNum(opVarDsc);
     unsigned   fieldSize = opVarDsc->lvSize() / fieldCount;
-    var_types  fieldType = TYP_SIMD16;
+    var_types  fieldType = Compiler::getSIMDTypeForSize(fieldSize);

     GenTreeFieldList* fieldList = new (this, GT_FIELD_LIST) GenTreeFieldList();
     int               offset    = 0;
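
The cross-lane scan in IsValidForShuffle above encodes an AVX2 limitation: the 256-bit byte shuffle (vpshufb) only permutes within each 128-bit lane, so byte/short shuffles whose constant indices move elements across the lane boundary can only be accelerated with the AVX512 variable-permute support (hence the VBMI_VL/BW_VL checks); the TODO notes emulation as future work. A standalone sketch of the same test (hypothetical harness, not JIT code):

#include <cstdint>
#include <cstdio>

// Returns true when any in-range index selects an element from the other
// 128-bit lane; out-of-range indices are skipped, mirroring the 'continue'
// in the JIT code above.
static bool CrossesLane(const uint8_t* indices, size_t elementCount)
{
    size_t half = elementCount / 2;
    for (size_t i = 0; i < elementCount; i++)
    {
        uint8_t value = indices[i];
        if (value >= elementCount)
        {
            continue;
        }
        if ((i < half) != (value < half))
        {
            return true;
        }
    }
    return false;
}

int main()
{
    uint8_t identity[32];
    uint8_t reverse[32];
    for (int i = 0; i < 32; i++)
    {
        identity[i] = (uint8_t)i;        // each element stays in its own lane
        reverse[i]  = (uint8_t)(31 - i); // full reversal must cross lanes
    }
    printf("identity crosses: %d\n", CrossesLane(identity, 32) ? 1 : 0); // 0
    printf("reverse crosses:  %d\n", CrossesLane(reverse, 32) ? 1 : 0);  // 1
    return 0;
}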
24 changes: 22 additions & 2 deletions src/coreclr/jit/hwintrinsic.cpp
@@ -1291,6 +1291,7 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic,
     int  immUpperBound   = 0;
     bool hasFullRangeImm = false;
     bool useFallback     = false;
+    bool setMethodHandle = false;

     getHWIntrinsicImmOps(intrinsic, sig, &immOp1, &immOp2);

@@ -1307,7 +1308,14 @@
         if (!CheckHWIntrinsicImmRange(intrinsic, simdBaseJitType, immOp2, mustExpand, immLowerBound, immUpperBound,
                                       false, &useFallback))
         {
-            return useFallback ? impNonConstFallback(intrinsic, retType, simdBaseJitType) : nullptr;
+            if (useFallback)
+            {
+                return impNonConstFallback(intrinsic, retType, simdBaseJitType);
+            }
+            else
+            {
+                setMethodHandle = true;
+            }
         }
     }
 #else
@@ -1331,7 +1339,14 @@
         if (!CheckHWIntrinsicImmRange(intrinsic, simdBaseJitType, immOp1, mustExpand, immLowerBound, immUpperBound,
                                       hasFullRangeImm, &useFallback))
         {
-            return useFallback ? impNonConstFallback(intrinsic, retType, simdBaseJitType) : nullptr;
+            if (useFallback)
+            {
+                return impNonConstFallback(intrinsic, retType, simdBaseJitType);
+            }
+            else
+            {
+                setMethodHandle = true;
+            }
         }
     }

@@ -1602,6 +1617,11 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic,
                                    simdSize, mustExpand);
     }

+    if (setMethodHandle && (retNode != nullptr))
+    {
+        retNode->AsHWIntrinsic()->SetMethodHandle(this, method R2RARG(*entryPoint));
+    }
+
 #if defined(TARGET_ARM64)
     if (HWIntrinsicInfo::IsExplicitMaskedOperation(intrinsic))
     {
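
The common thread in these hunks is operands that the underlying instruction encodes as an immediate. C intrinsics expose the same constraint; a small illustration (assumes an SSE2 target; unrelated to the runtime's own code):

#include <immintrin.h>

int ExtractLaneConst(__m128i v)
{
    return _mm_extract_epi16(v, 3); // fine: pextrw encodes the lane as an imm8
}

int ExtractLaneVariable(__m128i v, int i)
{
    // return _mm_extract_epi16(v, i); // rejected: the index must be a constant
    alignas(16) short lanes[8];
    _mm_store_si128((__m128i*)lanes, v); // fallback: spill the vector to memory
    return lanes[i & 7];                 // and index it like an array
}

Previously the importer gave up (returned nullptr) when such an operand was not yet a constant and no fallback instruction form existed; with setMethodHandle it keeps the hwintrinsic node alive and records the method handle, so the user-call rewrite can still happen later if the operand never becomes constant.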
60 changes: 60 additions & 0 deletions src/coreclr/jit/hwintrinsic.h
@@ -1007,6 +1007,66 @@ struct HWIntrinsicInfo
         return (flags & HW_Flag_PermuteVar2x) != 0;
     }
 #endif // TARGET_XARCH
+
+#if defined(TARGET_ARM64)
+    static void GetImmOpsPositions(NamedIntrinsic id, CORINFO_SIG_INFO* sig, int* imm1Pos, int* imm2Pos)
+    {
+        switch (id)
+        {
+            case NI_AdvSimd_Insert:
+            case NI_AdvSimd_InsertScalar:
+            case NI_AdvSimd_LoadAndInsertScalar:
+            case NI_AdvSimd_LoadAndInsertScalarVector64x2:
+            case NI_AdvSimd_LoadAndInsertScalarVector64x3:
+            case NI_AdvSimd_LoadAndInsertScalarVector64x4:
+            case NI_AdvSimd_Arm64_LoadAndInsertScalar:
+            case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x2:
+            case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x3:
+            case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x4:
+            {
+                assert(sig->numArgs == 3);
+                *imm1Pos = 1;
+                break;
+            }
+
+            case NI_AdvSimd_Arm64_InsertSelectedScalar:
+            {
+                assert(sig->numArgs == 4);
+                *imm1Pos = 2;
+                *imm2Pos = 0;
+                break;
+            }
+
+            case NI_Sve_SaturatingDecrementBy16BitElementCount:
+            case NI_Sve_SaturatingDecrementBy32BitElementCount:
+            case NI_Sve_SaturatingDecrementBy64BitElementCount:
+            case NI_Sve_SaturatingDecrementBy8BitElementCount:
+            case NI_Sve_SaturatingIncrementBy16BitElementCount:
+            case NI_Sve_SaturatingIncrementBy32BitElementCount:
+            case NI_Sve_SaturatingIncrementBy64BitElementCount:
+            case NI_Sve_SaturatingIncrementBy8BitElementCount:
+            case NI_Sve_SaturatingDecrementBy16BitElementCountScalar:
+            case NI_Sve_SaturatingDecrementBy32BitElementCountScalar:
+            case NI_Sve_SaturatingDecrementBy64BitElementCountScalar:
+            case NI_Sve_SaturatingIncrementBy16BitElementCountScalar:
+            case NI_Sve_SaturatingIncrementBy32BitElementCountScalar:
+            case NI_Sve_SaturatingIncrementBy64BitElementCountScalar:
+            {
+                assert(sig->numArgs == 3);
+                *imm1Pos = 1;
+                *imm2Pos = 0;
+                break;
+            }
+
+            default:
+            {
+                assert(sig->numArgs > 0);
+                *imm1Pos = 0;
+                break;
+            }
+        }
+    }
+#endif // TARGET_ARM64
 };

 #ifdef TARGET_ARM64
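
Reading GetImmOpsPositions above, the positions appear to be importer stack offsets: 0 names the top of the stack, i.e. the last argument, and position k the argument k slots below it (a convention inferred from the cases, not stated in the header). A tiny self-checking sketch of that mapping (hypothetical helper, not runtime code):

#include <cassert>

// An operand at 0-based argument index 'argIndex' of a call with 'numArgs'
// arguments sits (numArgs - 1 - argIndex) slots below the importer stack top.
static int StackPosFromArgIndex(int numArgs, int argIndex)
{
    return numArgs - 1 - argIndex;
}

int main()
{
    // AdvSimd.Insert(vector, byte index, data): the immediate is argument 1 of 3.
    assert(StackPosFromArgIndex(3, 1) == 1); // matches *imm1Pos = 1

    // AdvSimd.Arm64.InsertSelectedScalar(result, byte resultIndex, value, byte valueIndex):
    // the immediates are arguments 1 and 3 of 4.
    assert(StackPosFromArgIndex(4, 1) == 2); // matches *imm1Pos = 2
    assert(StackPosFromArgIndex(4, 3) == 0); // matches *imm2Pos = 0

    return 0;
}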