Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 72 additions & 0 deletions src/coreclr/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32593,6 +32593,78 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
{
switch (ni)
{
#if defined(TARGET_ARM64)
case NI_Vector64_ExtractMostSignificantBits:
#elif defined(TARGET_XARCH)
case NI_Vector256_ExtractMostSignificantBits:
case NI_X86Base_MoveMask:
case NI_AVX_MoveMask:
case NI_AVX2_MoveMask:
#endif
case NI_Vector128_ExtractMostSignificantBits:
{
simdmask_t simdMaskVal;

switch (simdSize)
{
case 8:
{
EvaluateExtractMSB<simd8_t>(simdBaseType, &simdMaskVal, cnsNode->AsVecCon()->gtSimd8Val);
break;
}

case 16:
{
EvaluateExtractMSB<simd16_t>(simdBaseType, &simdMaskVal, cnsNode->AsVecCon()->gtSimd16Val);
break;
}

#if defined(TARGET_XARCH)
case 32:
{
EvaluateExtractMSB<simd32_t>(simdBaseType, &simdMaskVal, cnsNode->AsVecCon()->gtSimd32Val);
break;
}
#endif // TARGET_XARCH

default:
{
unreached();
}
}

uint32_t elemCount = simdSize / genTypeSize(simdBaseType);
uint64_t mask = simdMaskVal.GetRawBits() & simdmask_t::GetBitMask(elemCount);

assert(varTypeIsInt(retType));
assert(elemCount <= 32);

resultNode = gtNewIconNode(static_cast<int32_t>(mask));
break;
}

#ifdef TARGET_XARCH
case NI_AVX512_MoveMask:
{
GenTreeMskCon* mskCns = cnsNode->AsMskCon();

uint32_t elemCount = simdSize / genTypeSize(simdBaseType);
uint64_t mask = mskCns->gtSimdMaskVal.GetRawBits() & simdmask_t::GetBitMask(elemCount);

if (varTypeIsInt(retType))
{
assert(elemCount <= 32);
resultNode = gtNewIconNode(static_cast<int32_t>(mask));
}
else
{
assert(varTypeIsLong(retType));
resultNode = gtNewLconNode(static_cast<int64_t>(mask));
}
break;
}
#endif // TARGET_XARCH

#ifdef TARGET_ARM64
case NI_ArmBase_LeadingZeroCount:
#else
Expand Down
162 changes: 2 additions & 160 deletions src/coreclr/jit/hwintrinsicarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1346,166 +1346,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
case NI_Vector128_ExtractMostSignificantBits:
{
assert(sig->numArgs == 1);

// ARM64 doesn't have a single instruction that performs the behavior so we'll emulate it instead.
// To do this, we effectively perform the following steps:
// 1. tmp = input & 0x80 ; and the input to clear all but the most significant bit
// 2. tmp = tmp >> index ; right shift each element by its index
// 3. tmp = sum(tmp) ; sum the elements together

// For byte/sbyte, we also need to handle the fact that we can only shift by up to 8
// but for Vector128, we have 16 elements to handle. In that scenario, we will simply
// extract both scalars, and combine them via: (upper << 8) | lower

var_types simdType = getSIMDTypeForSize(simdSize);

op1 = impSIMDPopStack();

GenTreeVecCon* vecCon2 = gtNewVconNode(simdType);
GenTreeVecCon* vecCon3 = gtNewVconNode(simdType);

switch (simdBaseType)
{
case TYP_BYTE:
case TYP_UBYTE:
{
simdBaseType = TYP_UBYTE;
simdBaseJitType = CORINFO_TYPE_UBYTE;

vecCon2->gtSimdVal.u64[0] = 0x8080808080808080;
vecCon3->gtSimdVal.u64[0] = 0x00FFFEFDFCFBFAF9;

if (simdSize == 16)
{
vecCon2->gtSimdVal.u64[1] = 0x8080808080808080;
vecCon3->gtSimdVal.u64[1] = 0x00FFFEFDFCFBFAF9;
}
break;
}

case TYP_SHORT:
case TYP_USHORT:
{
simdBaseType = TYP_USHORT;
simdBaseJitType = CORINFO_TYPE_USHORT;

vecCon2->gtSimdVal.u64[0] = 0x8000800080008000;
vecCon3->gtSimdVal.u64[0] = 0xFFF4FFF3FFF2FFF1;

if (simdSize == 16)
{
vecCon2->gtSimdVal.u64[1] = 0x8000800080008000;
vecCon3->gtSimdVal.u64[1] = 0xFFF8FFF7FFF6FFF5;
}
break;
}

case TYP_INT:
case TYP_UINT:
case TYP_FLOAT:
{
simdBaseType = TYP_INT;
simdBaseJitType = CORINFO_TYPE_INT;

vecCon2->gtSimdVal.u64[0] = 0x8000000080000000;
vecCon3->gtSimdVal.u64[0] = 0xFFFFFFE2FFFFFFE1;

if (simdSize == 16)
{
vecCon2->gtSimdVal.u64[1] = 0x8000000080000000;
vecCon3->gtSimdVal.u64[1] = 0xFFFFFFE4FFFFFFE3;
}
break;
}

case TYP_LONG:
case TYP_ULONG:
case TYP_DOUBLE:
{
simdBaseType = TYP_LONG;
simdBaseJitType = CORINFO_TYPE_LONG;

vecCon2->gtSimdVal.u64[0] = 0x8000000000000000;
vecCon3->gtSimdVal.u64[0] = 0xFFFFFFFFFFFFFFC1;

if (simdSize == 16)
{
vecCon2->gtSimdVal.u64[1] = 0x8000000000000000;
vecCon3->gtSimdVal.u64[1] = 0xFFFFFFFFFFFFFFC2;
}
break;
}

default:
{
unreached();
}
}

op3 = vecCon3;
op2 = vecCon2;
op1 = gtNewSimdHWIntrinsicNode(simdType, op1, op2, NI_AdvSimd_And, simdBaseJitType, simdSize);

NamedIntrinsic shiftIntrinsic = NI_AdvSimd_ShiftLogical;

if ((simdSize == 8) && varTypeIsLong(simdBaseType))
{
shiftIntrinsic = NI_AdvSimd_ShiftLogicalScalar;
}

op1 = gtNewSimdHWIntrinsicNode(simdType, op1, op3, shiftIntrinsic, simdBaseJitType, simdSize);

if (varTypeIsByte(simdBaseType) && (simdSize == 16))
{
op1 = impCloneExpr(op1, &op2, CHECK_SPILL_ALL,
nullptr DEBUGARG("Clone op1 for vector extractmostsignificantbits"));

op1 = gtNewSimdGetLowerNode(TYP_SIMD8, op1, simdBaseJitType, simdSize);
op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_AdvSimd_Arm64_AddAcross, simdBaseJitType, 8);
op1 = gtNewSimdToScalarNode(genActualType(simdBaseType), op1, simdBaseJitType, 8);
op1 = gtNewCastNode(TYP_INT, op1, /* isUnsigned */ true, TYP_INT);

GenTree* zero = gtNewZeroConNode(TYP_SIMD16);
ssize_t index = 8 / genTypeSize(simdBaseType);

op2 = gtNewSimdGetUpperNode(TYP_SIMD8, op2, simdBaseJitType, simdSize);
op2 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op2, NI_AdvSimd_Arm64_AddAcross, simdBaseJitType, 8);
op2 = gtNewSimdToScalarNode(genActualType(simdBaseType), op2, simdBaseJitType, 8);
op2 = gtNewCastNode(TYP_INT, op2, /* isUnsigned */ true, TYP_INT);

op2 = gtNewOperNode(GT_LSH, TYP_INT, op2, gtNewIconNode(8));
retNode = gtNewOperNode(GT_OR, TYP_INT, op1, op2);
}
else
{
if (!varTypeIsLong(simdBaseType))
{
if ((simdSize == 8) && ((simdBaseType == TYP_INT) || (simdBaseType == TYP_UINT)))
{
op1 = impCloneExpr(op1, &op2, CHECK_SPILL_ALL,
nullptr DEBUGARG("Clone op1 for vector extractmostsignificantbits"));
op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, op2, NI_AdvSimd_AddPairwise, simdBaseJitType,
simdSize);
}
else
{
op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_AdvSimd_Arm64_AddAcross, simdBaseJitType,
simdSize);
}
}
else if (simdSize == 16)
{
op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_AdvSimd_Arm64_AddPairwiseScalar, simdBaseJitType,
simdSize);
}

retNode = gtNewSimdToScalarNode(genActualType(simdBaseType), op1, simdBaseJitType, 8);

if ((simdBaseType != TYP_INT) && (simdBaseType != TYP_UINT))
{
retNode = gtNewCastNode(TYP_INT, retNode, /* isUnsigned */ true, TYP_INT);
}
}
op1 = impSIMDPopStack();
retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, simdBaseJitType, simdSize);
break;
}

Expand Down
4 changes: 2 additions & 2 deletions src/coreclr/jit/hwintrinsiclistarm64.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ HARDWARE_INTRINSIC(Vector64, CreateSequence,
HARDWARE_INTRINSIC(Vector64, Dot, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Vector64, Equals, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector64, EqualsAny, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Vector64, ExtractMostSignificantBits, 8, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Vector64, ExtractMostSignificantBits, 8, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector64, Floor, 8, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector64, FusedMultiplyAdd, 8, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector64, GetElement, 8, 2, {INS_smov, INS_umov, INS_smov, INS_umov, INS_smov, INS_umov, INS_umov, INS_umov, INS_dup, INS_dup}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_SupportsContainment)
Expand Down Expand Up @@ -182,7 +182,7 @@ HARDWARE_INTRINSIC(Vector128, CreateSequence,
HARDWARE_INTRINSIC(Vector128, Dot, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Vector128, Equals, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector128, EqualsAny, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Vector128, ExtractMostSignificantBits, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Vector128, ExtractMostSignificantBits, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector128, Floor, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector128, FusedMultiplyAdd, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector128, GetElement, 16, 2, {INS_smov, INS_umov, INS_smov, INS_umov, INS_smov, INS_umov, INS_umov, INS_umov, INS_dup, INS_dup}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_SupportsContainment)
Expand Down
Loading
Loading