Skip to content
This repository was archived by the owner on Jan 23, 2023. It is now read-only.

Commit 2d650dc

Browse files
committed
Implement SetAllVector256
1 parent 04d9414 commit 2d650dc

7 files changed

+425
-8
lines changed

src/jit/hwintrinsiccodegenxarch.cpp

+61
Original file line numberDiff line numberDiff line change
@@ -1311,6 +1311,67 @@ void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node)
13111311
break;
13121312
}
13131313

1314+
case NI_AVX_SetAllVector256:
1315+
{
1316+
assert(op1 != nullptr);
1317+
assert(op2 == nullptr);
1318+
op1Reg = op1->gtRegNum;
1319+
if (varTypeIsIntegral(baseType))
1320+
{
1321+
// If the argument is an integer, it needs to be moved into an XMM register
1322+
regNumber tmpXMM = node->ExtractTempReg();
1323+
emit->emitIns_R_R(INS_mov_i2xmm, emitActualTypeSize(baseType), tmpXMM, op1Reg);
1324+
op1Reg = tmpXMM;
1325+
}
1326+
1327+
if (compiler->compSupports(InstructionSet_AVX2))
1328+
{
1329+
// generate broadcast instructions if AVX2 is available
1330+
emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD32), targetReg, op1Reg);
1331+
}
1332+
else
1333+
{
1334+
// duplicate the scalar argument to XMM register
1335+
switch (baseType)
1336+
{
1337+
case TYP_FLOAT:
1338+
emit->emitIns_SIMD_R_R_I(INS_vpermilps, emitTypeSize(TYP_SIMD16), op1Reg, op1Reg, 0);
1339+
break;
1340+
case TYP_DOUBLE:
1341+
emit->emitIns_R_R(INS_movddup, emitTypeSize(TYP_SIMD16), op1Reg, op1Reg);
1342+
break;
1343+
case TYP_BYTE:
1344+
case TYP_UBYTE:
1345+
{
1346+
regNumber tmpZeroReg = node->GetSingleTempReg();
1347+
emit->emitIns_R_R(INS_pxor, emitTypeSize(TYP_SIMD16), tmpZeroReg, tmpZeroReg);
1348+
emit->emitIns_SIMD_R_R_R(INS_pshufb, emitTypeSize(TYP_SIMD16), op1Reg, op1Reg, tmpZeroReg);
1349+
break;
1350+
}
1351+
case TYP_SHORT:
1352+
case TYP_USHORT:
1353+
emit->emitIns_SIMD_R_R_I(INS_pshuflw, emitTypeSize(TYP_SIMD16), op1Reg, op1Reg, 0);
1354+
emit->emitIns_SIMD_R_R_I(INS_pshufd, emitTypeSize(TYP_SIMD16), op1Reg, op1Reg, 80);
1355+
break;
1356+
case TYP_INT:
1357+
case TYP_UINT:
1358+
emit->emitIns_SIMD_R_R_I(INS_pshufd, emitTypeSize(TYP_SIMD16), op1Reg, op1Reg, 0);
1359+
break;
1360+
case TYP_LONG:
1361+
case TYP_ULONG:
1362+
emit->emitIns_SIMD_R_R_I(INS_pshufd, emitTypeSize(TYP_SIMD16), op1Reg, op1Reg, 68);
1363+
break;
1364+
1365+
default:
1366+
unreached();
1367+
break;
1368+
}
1369+
// duplicate the XMM register to YMM register
1370+
emit->emitIns_SIMD_R_R_R_I(INS_vinsertf128, emitTypeSize(TYP_SIMD32), targetReg, op1Reg, op1Reg, 1);
1371+
}
1372+
break;
1373+
}
1374+
13141375
case NI_AVX_ExtendToVector256:
13151376
{
13161377
// ExtendToVector256 has zero-extend semantics in order to ensure it is deterministic

src/jit/hwintrinsiclistxarch.h

+4
Original file line numberDiff line numberDiff line change
@@ -347,12 +347,14 @@ HARDWARE_INTRINSIC(AVX_Divide, "Divide",
347347
HARDWARE_INTRINSIC(AVX_DotProduct, "DotProduct", AVX, -1, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_dpps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
348348
HARDWARE_INTRINSIC(AVX_DuplicateEvenIndexed, "DuplicateEvenIndexed", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movsldup, INS_movddup}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
349349
HARDWARE_INTRINSIC(AVX_DuplicateOddIndexed, "DuplicateOddIndexed", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movshdup, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
350+
HARDWARE_INTRINSIC(AVX_Extract, "Extract", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_BaseTypeFromFirstArg|HW_Flag_FullRangeIMM|HW_Flag_NoCodeGen)
350351
HARDWARE_INTRINSIC(AVX_ExtendToVector256, "ExtendToVector256", AVX, -1, 32, 1, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movaps, INS_movapd}, HW_Category_Helper, HW_Flag_OneTypeGeneric|HW_Flag_NoRMWSemantics)
351352
HARDWARE_INTRINSIC(AVX_ExtractVector128, "ExtractVector128", AVX, -1, 32, -1, {INS_vextractf128,INS_vextractf128,INS_vextractf128,INS_vextractf128,INS_vextractf128,INS_vextractf128,INS_vextractf128,INS_vextractf128,INS_vextractf128, INS_vextractf128},HW_Category_IMM, HW_Flag_OneTypeGeneric|HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_FullRangeIMM)
352353
HARDWARE_INTRINSIC(AVX_Floor, "Floor", AVX, 9, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
353354
HARDWARE_INTRINSIC(AVX_GetLowerHalf, "GetLowerHalf", AVX, -1, 32, 1, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movaps, INS_movapd}, HW_Category_Helper, HW_Flag_OneTypeGeneric|HW_Flag_NoRMWSemantics)
354355
HARDWARE_INTRINSIC(AVX_HorizontalAdd, "HorizontalAdd", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_haddps, INS_haddpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
355356
HARDWARE_INTRINSIC(AVX_HorizontalSubtract, "HorizontalSubtract", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_hsubps, INS_hsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
357+
HARDWARE_INTRINSIC(AVX_Insert, "Insert", AVX, -1, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoCodeGen)
356358
HARDWARE_INTRINSIC(AVX_InsertVector128, "InsertVector128", AVX, -1, 32, 3, {INS_vinsertf128,INS_vinsertf128,INS_vinsertf128,INS_vinsertf128,INS_vinsertf128,INS_vinsertf128,INS_vinsertf128,INS_vinsertf128,INS_vinsertf128, INS_vinsertf128},HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_OneTypeGeneric|HW_Flag_SpecialCodeGen)
357359
HARDWARE_INTRINSIC(AVX_LoadAlignedVector256, "LoadAlignedVector256", AVX, -1, 32, 1, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movaps, INS_movapd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
358360
HARDWARE_INTRINSIC(AVX_LoadDquVector256, "LoadDquVector256", AVX, -1, 32, 1, {INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
@@ -369,6 +371,8 @@ HARDWARE_INTRINSIC(AVX_RoundToNearestInteger, "RoundToNea
369371
HARDWARE_INTRINSIC(AVX_RoundToNegativeInfinity, "RoundToNegativeInfinity", AVX, 9, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
370372
HARDWARE_INTRINSIC(AVX_RoundToPositiveInfinity, "RoundToPositiveInfinity", AVX, 10, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
371373
HARDWARE_INTRINSIC(AVX_RoundToZero, "RoundToZero", AVX, 11, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
374+
HARDWARE_INTRINSIC(AVX_SetVector256, "SetVector256", AVX, -1, 32, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen|HW_Flag_SecondArgMaybe64Bit)
375+
HARDWARE_INTRINSIC(AVX_SetAllVector256, "SetAllVector256", AVX, -1, 32, 1, {INS_vpbroadcastb,INS_vpbroadcastb,INS_vpbroadcastw,INS_vpbroadcastw,INS_vpbroadcastd,INS_vpbroadcastd,INS_vpbroadcastq,INS_vpbroadcastq,INS_vbroadcastss,INS_vbroadcastsd},HW_Category_Helper, HW_Flag_MultiIns|HW_Flag_SpecialImport|HW_Flag_OneTypeGeneric)
372376
HARDWARE_INTRINSIC(AVX_SetZeroVector256, "SetZeroVector256", AVX, -1, 32, 0, {INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_pxor, INS_xorps, INS_xorpd}, HW_Category_Helper, HW_Flag_OneTypeGeneric|HW_Flag_NoRMWSemantics)
373377
HARDWARE_INTRINSIC(AVX_Shuffle, "Shuffle", AVX, -1, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_shufps, INS_shufpd}, HW_Category_IMM, HW_Flag_NoRMWSemantics|HW_Flag_FullRangeIMM)
374378
HARDWARE_INTRINSIC(AVX_Sqrt, "Sqrt", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtps, INS_sqrtpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)

src/jit/hwintrinsicxarch.cpp

+40-8
Original file line numberDiff line numberDiff line change
@@ -1058,6 +1058,25 @@ GenTree* Compiler::impSSE42Intrinsic(NamedIntrinsic intrinsic,
10581058
return retNode;
10591059
}
10601060

1061+
//------------------------------------------------------------------------
1062+
// getMidAndNormalizedIndex: compute the middle index of a Vector256<baseType>
1063+
// and normalize the caller's index in place to the valid element range
1064+
//
1065+
// Arguments:
1066+
// indexPtr -- in/out pointer to the index value; overwritten with the normalized index
1067+
// baseType -- the base type of the Vector256<T> (asserted to be an arithmetic type)
1068+
//
1069+
// Return Value:
1070+
// the middle index of a Vector256<baseType>, i.e. 16 / genTypeSize(baseType);
// the normalized index is not returned directly but stored back through indexPtr
1071+
//
1072+
static int getMidAndNormalizedIndex(int* indexPtr, var_types baseType)
1073+
{
1074+
assert(varTypeIsArithmetic(baseType));
1075+
// clear the unused bits to normalize the index into the range of [0, length of Vector256<baseType>)
// (masks with (32 / genTypeSize(baseType)) - 1; presumably genTypeSize is the element size in bytes,
// so the element count is a power of two and the mask is exact -- TODO confirm)
1076+
*indexPtr = (*indexPtr) & (32 / genTypeSize(baseType) - 1);
1077+
return (16 / genTypeSize(baseType));
1078+
}
1079+
10611080
GenTree* Compiler::impAvxOrAvx2Intrinsic(NamedIntrinsic intrinsic,
10621081
CORINFO_METHOD_HANDLE method,
10631082
CORINFO_SIG_INFO* sig,
@@ -1084,16 +1103,15 @@ GenTree* Compiler::impAvxOrAvx2Intrinsic(NamedIntrinsic intrinsic,
10841103
var_types retType = JITtype2varType(sig->retType);
10851104
assert(varTypeIsArithmetic(baseType));
10861105

1087-
ival = ival & (32 / genTypeSize(baseType) - 1); // clear the unused bits
1088-
int halfIndex = 16 / genTypeSize(baseType);
1106+
int midIndex = getMidAndNormalizedIndex(&ival, baseType);
10891107
NamedIntrinsic extractIntrinsic = varTypeIsShort(baseType) ? NI_SSE2_Extract : NI_SSE41_Extract;
10901108
GenTree* half = nullptr;
10911109

1092-
if (ival >= halfIndex)
1110+
if (ival >= midIndex)
10931111
{
10941112
half = gtNewSimdHWIntrinsicNode(TYP_SIMD16, vectorOp, gtNewIconNode(1), NI_AVX_ExtractVector128,
10951113
baseType, 32);
1096-
ival -= halfIndex;
1114+
ival -= midIndex;
10971115
}
10981116
else
10991117
{
@@ -1117,21 +1135,20 @@ GenTree* Compiler::impAvxOrAvx2Intrinsic(NamedIntrinsic intrinsic,
11171135
baseType = getBaseTypeOfSIMDType(sig->retTypeSigClass);
11181136
assert(varTypeIsArithmetic(baseType));
11191137

1120-
ival = ival & (32 / genTypeSize(baseType) - 1); // clear the unused bits
1121-
int halfIndex = 16 / genTypeSize(baseType);
1138+
int midIndex = getMidAndNormalizedIndex(&ival, baseType);
11221139
NamedIntrinsic insertIntrinsic = varTypeIsShort(baseType) ? NI_SSE2_Insert : NI_SSE41_Insert;
11231140

11241141
GenTree* clonedVectorOp;
11251142
vectorOp =
11261143
impCloneExpr(vectorOp, &clonedVectorOp, info.compCompHnd->getArgClass(sig, sig->args),
11271144
(unsigned)CHECK_SPILL_ALL, nullptr DEBUGARG("AVX Insert clones the vector operand"));
11281145

1129-
if (ival >= halfIndex)
1146+
if (ival >= midIndex)
11301147
{
11311148
GenTree* halfVector = gtNewSimdHWIntrinsicNode(TYP_SIMD16, vectorOp, gtNewIconNode(1),
11321149
NI_AVX_ExtractVector128, baseType, 32);
11331150
GenTree* ModifiedHalfVector =
1134-
gtNewSimdHWIntrinsicNode(TYP_SIMD16, halfVector, dataOp, gtNewIconNode(ival - halfIndex),
1151+
gtNewSimdHWIntrinsicNode(TYP_SIMD16, halfVector, dataOp, gtNewIconNode(ival - midIndex),
11351152
insertIntrinsic, baseType, 16);
11361153
retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD32, clonedVectorOp, ModifiedHalfVector, gtNewIconNode(1),
11371154
NI_AVX_InsertVector128, baseType, 32);
@@ -1197,6 +1214,21 @@ GenTree* Compiler::impAvxOrAvx2Intrinsic(NamedIntrinsic intrinsic,
11971214
break;
11981215
}
11991216

1217+
case NI_AVX_SetAllVector256:
1218+
{
1219+
GenTree* arg = impPopStack().val;
1220+
baseType = getBaseTypeOfSIMDType(sig->retTypeSigClass);
1221+
#ifdef _TARGET_X86_
1222+
// TODO-XARCH: support long/ulong on 32-bit platforms
1223+
if (varTypeIsLong(baseType))
1224+
{
1225+
return impUnsupportedHWIntrinsic(CORINFO_HELP_THROW_PLATFORM_NOT_SUPPORTED, method, sig, mustExpand);
1226+
}
1227+
#endif
1228+
retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD32, arg, NI_AVX_SetAllVector256, baseType, 32);
1229+
break;
1230+
}
1231+
12001232
case NI_AVX_ExtractVector128:
12011233
case NI_AVX2_ExtractVector128:
12021234
{

src/jit/lsraxarch.cpp

+14
Original file line numberDiff line numberDiff line change
@@ -2368,6 +2368,20 @@ void LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree)
23682368
break;
23692369
}
23702370

2371+
case NI_AVX_SetAllVector256:
2372+
{
2373+
if (varTypeIsIntegral(baseType))
2374+
{
2375+
info->internalFloatCount = 1;
2376+
if (!compiler->compSupports(InstructionSet_AVX2) && varTypeIsByte(baseType))
2377+
{
2378+
info->internalFloatCount += 1;
2379+
}
2380+
info->setInternalCandidates(this, allSIMDRegs());
2381+
}
2382+
break;
2383+
}
2384+
23712385
case NI_SSE2_MaskMove:
23722386
{
23732387
// SSE2 MaskMove hardcodes the destination (op3) in DI/EDI/RDI

0 commit comments

Comments
 (0)