Skip to content

Commit d87cec8

Browse files
committed
Address some feedback & a bug fix
- Implement a bunch of feedback - Fix an oversight in `IsValidForShuffle` where we don't treat something as variable indices that gets emitted as such
1 parent 6aed6f0 commit d87cec8

File tree

5 files changed

+119
-87
lines changed

5 files changed

+119
-87
lines changed

src/coreclr/jit/compiler.h

+6-2
Original file line numberDiff line numberDiff line change
@@ -3345,7 +3345,7 @@ class Compiler
33453345
GenTree* gtNewSimdRoundNode(
33463346
var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize);
33473347

3348-
GenTree* gtNewSimdShuffleNodeVariable(var_types type,
3348+
GenTree* gtNewSimdShuffleVariableNode(var_types type,
33493349
GenTree* op1,
33503350
GenTree* op2,
33513351
CorInfoType simdBaseJitType,
@@ -4682,7 +4682,11 @@ class Compiler
46824682
bool mustExpand);
46834683

46844684
#ifdef FEATURE_HW_INTRINSICS
4685-
bool IsValidForShuffle(GenTree* indices, unsigned simdSize, var_types simdBaseType, bool* canBecomeValid) const;
4685+
bool IsValidForShuffle(GenTree* indices,
4686+
unsigned simdSize,
4687+
var_types simdBaseType,
4688+
bool* canBecomeValid,
4689+
bool isShuffleNative) const;
46864690

46874691
GenTree* impHWIntrinsic(NamedIntrinsic intrinsic,
46884692
CORINFO_CLASS_HANDLE clsHnd,

src/coreclr/jit/gentree.cpp

+95-42
Original file line numberDiff line numberDiff line change
@@ -18393,10 +18393,11 @@ unsigned GenTreeVecCon::ElementCount(unsigned simdSize, var_types simdBaseType)
1839318393
bool Compiler::IsValidForShuffle(GenTree* indices,
1839418394
unsigned simdSize,
1839518395
var_types simdBaseType,
18396-
bool* canBecomeValid) const
18396+
bool* canBecomeValid,
18397+
bool isShuffleNative) const
1839718398
{
1839818399
#if defined(TARGET_XARCH)
18399-
if (canBecomeValid)
18400+
if (canBecomeValid != nullptr)
1840018401
{
1840118402
*canBecomeValid = false;
1840218403
}
@@ -18414,7 +18415,7 @@ bool Compiler::IsValidForShuffle(GenTree* indices,
1841418415
}
1841518416
else if (simdSize == 64)
1841618417
{
18417-
if (varTypeIsByte(simdBaseType) && !compOpportunisticallyDependsOn(InstructionSet_AVX512VBMI))
18418+
if (varTypeIsByte(simdBaseType) && (!compOpportunisticallyDependsOn(InstructionSet_AVX512VBMI)))
1841818419
{
1841918420
// TYP_BYTE, TYP_UBYTE need AVX512VBMI.
1842018421
return false;
@@ -18424,13 +18425,27 @@ bool Compiler::IsValidForShuffle(GenTree* indices,
1842418425
{
1842518426
assert(simdSize == 16);
1842618427

18427-
if (varTypeIsSmall(simdBaseType) && !compOpportunisticallyDependsOn(InstructionSet_SSSE3))
18428+
if (varTypeIsSmall(simdBaseType) && (!compOpportunisticallyDependsOn(InstructionSet_SSSE3)))
1842818429
{
1842918430
// TYP_BYTE, TYP_UBYTE, TYP_SHORT, and TYP_USHORT need SSSE3 to be able to shuffle any operation
1843018431
return false;
1843118432
}
1843218433

18433-
if (!indices->IsCnsVec() && !compOpportunisticallyDependsOn(InstructionSet_SSSE3))
18434+
bool isVariableShuffle = !indices->IsCnsVec();
18435+
if ((!isVariableShuffle) && isShuffleNative)
18436+
{
18437+
// ShuffleNative with constant indices with 1 or more out of range indices is emitted as variable indices.
18438+
for (size_t index = 0; index < elementCount; index++)
18439+
{
18440+
uint64_t value = op2->GetIntegralVectorConstElement(index, simdBaseType);
18441+
if (value >= elementCount)
18442+
{
18443+
isVariableShuffle = true;
18444+
break;
18445+
}
18446+
}
18447+
}
18448+
if (isVariableShuffle && (!compOpportunisticallyDependsOn(InstructionSet_SSSE3)))
1843418449
{
1843518450
// the variable implementation for Vector128 Shuffle always needs SSSE3
1843618451
// however, this can become valid later if it becomes constant
@@ -18443,7 +18458,7 @@ bool Compiler::IsValidForShuffle(GenTree* indices,
1844318458
}
1844418459
#endif // TARGET_XARCH
1844518460

18446-
if (canBecomeValid)
18461+
if (canBecomeValid != nullptr)
1844718462
{
1844818463
*canBecomeValid = true;
1844918464
}
@@ -25391,7 +25406,23 @@ GenTree* Compiler::gtNewSimdRoundNode(var_types type, GenTree* op1, CorInfoType
2539125406
return gtNewSimdHWIntrinsicNode(type, op1, intrinsic, simdBaseJitType, simdSize);
2539225407
}
2539325408

25394-
GenTree* Compiler::gtNewSimdShuffleNodeVariable(
25409+
//------------------------------------------------------------------------
25410+
// gtNewSimdShuffleVariableNode: Creates a new simd shuffle node (with variable indices, or a case isn't handled in
25411+
// gtNewSimdShuffleNode for ShuffleUnsafe with out of bounds indices) - this is a helper function for
25412+
// gtNewSimdShuffleNode & should just be invoked by it indirectly, instead of other callers using it
25413+
//
25414+
// Arguments:
25415+
// type -- The type of the node
25416+
// op1 -- The values to shuffle
25417+
// op2 -- The indices to pick from (variable)
25418+
// simdBaseJitType -- The base jit type of the node
25419+
// simdSize -- The simd size of the node
25420+
// isShuffleNative -- Whether we're making a ShuffleNative node vs a Shuffle one
25421+
//
25422+
// Return Value:
25423+
// The shuffle node
25424+
//
25425+
GenTree* Compiler::gtNewSimdShuffleVariableNode(
2539525426
var_types type, GenTree* op1, GenTree* op2, CorInfoType simdBaseJitType, unsigned simdSize, bool isShuffleNative)
2539625427
{
2539725428
assert(IsBaselineSimdIsaSupportedDebugOnly());
@@ -25405,7 +25436,7 @@ GenTree* Compiler::gtNewSimdShuffleNodeVariable(
2540525436
var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType);
2540625437
assert(op2 != nullptr);
2540725438
assert(op2->TypeIs(type));
25408-
assert(!op2->IsCnsVec() || isShuffleNative);
25439+
assert((!op2->IsCnsVec()) || isShuffleNative);
2540925440

2541025441
GenTree* retNode = nullptr;
2541125442
GenTree* cnsNode = nullptr;
@@ -25419,7 +25450,7 @@ GenTree* Compiler::gtNewSimdShuffleNodeVariable(
2541925450
#if defined(TARGET_XARCH)
2542025451
if (!isShuffleNative)
2542125452
#elif defined(TARGET_ARM64)
25422-
if (!isShuffleNative && elementSize > 1)
25453+
if ((!isShuffleNative) && (elementSize > 1))
2542325454
#else
2542425455
#error Unsupported platform
2542525456
#endif // !TARGET_XARCH && !TARGET_ARM64
@@ -25474,7 +25505,7 @@ GenTree* Compiler::gtNewSimdShuffleNodeVariable(
2547425505
retNode->SetReverseOp();
2547525506
}
2547625507
}
25477-
else if (elementSize == 1 && simdSize == 16)
25508+
else if ((elementSize == 1) && (simdSize == 16))
2547825509
{
2547925510
assert(compIsaSupportedDebugOnly(InstructionSet_SSSE3));
2548025511

@@ -25483,7 +25514,7 @@ GenTree* Compiler::gtNewSimdShuffleNodeVariable(
2548325514
// high bit on index gives 0 already
2548425515
canUseSignedComparisonHint = true;
2548525516
}
25486-
else if (elementSize == 1 && simdSize == 32 &&
25517+
else if ((elementSize == 1) && (simdSize == 32) &&
2548725518
compIsEvexOpportunisticallySupported(isV512Supported, InstructionSet_AVX512VBMI_VL))
2548825519
{
2548925520
NamedIntrinsic intrinsic = isV512Supported ? NI_AVX512VBMI_VL_PermuteVar32x8 : NI_AVX10v1_PermuteVar32x8;
@@ -25492,26 +25523,26 @@ GenTree* Compiler::gtNewSimdShuffleNodeVariable(
2549225523
retNode = gtNewSimdHWIntrinsicNode(type, op2, op1, intrinsic, simdBaseJitType, simdSize);
2549325524
retNode->SetReverseOp();
2549425525
}
25495-
else if (elementSize == 2 && compIsEvexOpportunisticallySupported(isV512Supported, InstructionSet_AVX512BW_VL))
25526+
else if ((elementSize == 2) && compIsEvexOpportunisticallySupported(isV512Supported, InstructionSet_AVX512BW_VL))
2549625527
{
25497-
assert(simdSize == 16 || simdSize == 32);
25528+
assert((simdSize == 16) || (simdSize == 32));
2549825529
NamedIntrinsic intrinsic;
2549925530
if (isV512Supported)
2550025531
{
25501-
intrinsic = simdSize == 16 ? NI_AVX512BW_VL_PermuteVar8x16 : NI_AVX512BW_VL_PermuteVar16x16;
25532+
intrinsic = (simdSize == 16) ? NI_AVX512BW_VL_PermuteVar8x16 : NI_AVX512BW_VL_PermuteVar16x16;
2550225533
}
2550325534
else
2550425535
{
25505-
intrinsic = simdSize == 16 ? NI_AVX10v1_PermuteVar8x16 : NI_AVX10v1_PermuteVar16x16;
25536+
intrinsic = (simdSize == 16) ? NI_AVX10v1_PermuteVar8x16 : NI_AVX10v1_PermuteVar16x16;
2550625537
}
2550725538

2550825539
// swap the operands to match the encoding requirements
2550925540
retNode = gtNewSimdHWIntrinsicNode(type, op2, op1, intrinsic, simdBaseJitType, simdSize);
2551025541
retNode->SetReverseOp();
2551125542
}
25512-
else if (elementSize == 4 && (simdSize == 32 || compOpportunisticallyDependsOn(InstructionSet_AVX)))
25543+
else if ((elementSize == 4) && ((simdSize == 32) || compOpportunisticallyDependsOn(InstructionSet_AVX)))
2551325544
{
25514-
assert(simdSize == 16 || simdSize == 32);
25545+
assert((simdSize == 16) || (simdSize == 32));
2551525546

2551625547
if (simdSize == 32)
2551725548
{
@@ -25528,7 +25559,7 @@ GenTree* Compiler::gtNewSimdShuffleNodeVariable(
2552825559
retNode = gtNewSimdHWIntrinsicNode(type, op1, op2, NI_AVX_PermuteVar, CORINFO_TYPE_FLOAT, simdSize);
2552925560
}
2553025561
}
25531-
else if (elementSize == 8 && simdSize == 32 &&
25562+
else if ((elementSize == 8) && (simdSize == 32) &&
2553225563
compIsEvexOpportunisticallySupported(isV512Supported, InstructionSet_AVX512F_VL))
2553325564
{
2553425565
NamedIntrinsic intrinsic = isV512Supported ? NI_AVX512F_VL_PermuteVar4x64 : NI_AVX10v1_PermuteVar4x64;
@@ -25537,7 +25568,7 @@ GenTree* Compiler::gtNewSimdShuffleNodeVariable(
2553725568
retNode = gtNewSimdHWIntrinsicNode(type, op2, op1, intrinsic, simdBaseJitType, simdSize);
2553825569
retNode->SetReverseOp();
2553925570
}
25540-
else if (elementSize == 8 && simdSize == 16 &&
25571+
else if ((elementSize == 8) && (simdSize == 16) &&
2554125572
compIsEvexOpportunisticallySupported(isV512Supported, InstructionSet_AVX512F_VL))
2554225573
{
2554325574
GenTree* op1Copy = fgMakeMultiUse(&op1); // just use op1 again for the other variable
@@ -25546,12 +25577,12 @@ GenTree* Compiler::gtNewSimdShuffleNodeVariable(
2554625577
}
2554725578
else
2554825579
{
25549-
assert((elementSize == 1 && simdSize == 32) || elementSize == 2 || (elementSize == 4 && simdSize == 16) ||
25550-
elementSize == 8);
25580+
assert(((elementSize == 1) && (simdSize == 32)) || (elementSize == 2) || ((elementSize == 4) && (simdSize == 16)) ||
25581+
(elementSize == 8));
2555125582

25552-
if (elementSize == 8 && (simdSize == 32 || compOpportunisticallyDependsOn(InstructionSet_AVX)))
25583+
if ((elementSize == 8) && ((simdSize == 32) || compOpportunisticallyDependsOn(InstructionSet_AVX)))
2555325584
{
25554-
assert(simdSize == 16 || simdSize == 32);
25585+
assert((simdSize == 16) || (simdSize == 32));
2555525586
if (simdSize == 32)
2555625587
{
2555725588
assert(compIsaSupportedDebugOnly(InstructionSet_AVX2));
@@ -25715,7 +25746,7 @@ GenTree* Compiler::gtNewSimdShuffleNodeVariable(
2571525746
else
2571625747
{
2571725748
// create required clones of op2
25718-
op2Dup1 = op2DupSafe != nullptr ? gtCloneExpr(op2DupSafe) : fgMakeMultiUse(&op2);
25749+
op2Dup1 = (op2DupSafe != nullptr) ? gtCloneExpr(op2DupSafe) : fgMakeMultiUse(&op2);
2571925750
op2Dup2 = gtCloneExpr(op2Dup1);
2572025751
}
2572125752

@@ -25868,7 +25899,7 @@ GenTree* Compiler::gtNewSimdShuffleNodeVariable(
2586825899
simdBaseJitType = CORINFO_TYPE_LONG;
2586925900
}
2587025901
}
25871-
if (simdSize == 16 && simdBaseJitType == CORINFO_TYPE_INT)
25902+
if ((simdSize == 16) && (simdBaseJitType == CORINFO_TYPE_INT))
2587225903
{
2587325904
simdBaseJitType = CORINFO_TYPE_UINT;
2587425905
}
@@ -25915,7 +25946,7 @@ GenTree* Compiler::gtNewSimdShuffleNodeVariable(
2591525946
#if defined(TARGET_XARCH)
2591625947
if (!isShuffleNative)
2591725948
#elif defined(TARGET_ARM64)
25918-
if (!isShuffleNative && elementSize > 1)
25949+
if ((!isShuffleNative) && (elementSize > 1))
2591925950
#else
2592025951
#error Unsupported platform
2592125952
#endif // !TARGET_XARCH && !TARGET_ARM64
@@ -25945,9 +25976,9 @@ GenTree* Compiler::gtNewSimdShuffleNodeVariable(
2594525976
#if defined(TARGET_XARCH)
2594625977
// check if we have hardware accelerated unsigned comparison
2594725978
bool hardwareAcceleratedUnsignedComparison =
25948-
simdSize == 64 ||
25949-
(elementSize < 4 && compIsEvexOpportunisticallySupported(isV512Supported, InstructionSet_AVX512BW_VL)) ||
25950-
(elementSize >= 4 && compIsEvexOpportunisticallySupported(isV512Supported, InstructionSet_AVX512F_VL));
25979+
(simdSize == 64) ||
25980+
((elementSize < 4) && compIsEvexOpportunisticallySupported(isV512Supported, InstructionSet_AVX512BW_VL)) ||
25981+
((elementSize >= 4) && compIsEvexOpportunisticallySupported(isV512Supported, InstructionSet_AVX512F_VL));
2595125982

2595225983
// if the hardware doesn't support direct unsigned comparison, we attempt to use signed comparison
2595325984
if (!hardwareAcceleratedUnsignedComparison)
@@ -26003,6 +26034,20 @@ GenTree* Compiler::gtNewSimdShuffleNodeVariable(
2600326034
return retNode;
2600426035
}
2600526036

26037+
//------------------------------------------------------------------------
26038+
// gtNewSimdShuffleNode: Creates a new simd shuffle node
26039+
//
26040+
// Arguments:
26041+
// type -- The type of the node
26042+
// op1 -- The values to shuffle
26043+
// op2 -- The indices to pick from
26044+
// simdBaseJitType -- The base jit type of the node
26045+
// simdSize -- The simd size of the node
26046+
// isShuffleNative -- Whether we're making a ShuffleNative node vs a Shuffle one
26047+
//
26048+
// Return Value:
26049+
// The shuffle node
26050+
//
2600626051
GenTree* Compiler::gtNewSimdShuffleNode(
2600726052
var_types type, GenTree* op1, GenTree* op2, CorInfoType simdBaseJitType, unsigned simdSize, bool isShuffleNative)
2600826053
{
@@ -26016,7 +26061,12 @@ GenTree* Compiler::gtNewSimdShuffleNode(
2601626061

2601726062
assert(op2 != nullptr);
2601826063
assert(op2->TypeIs(type));
26019-
assert(op2->IsCnsVec());
26064+
26065+
// If op2 is not constant, call into the gtNewSimdShuffleVariableNode routine
26066+
if (!op2->IsCnsVec())
26067+
{
26068+
return gtNewSimdShuffleVariableNode(type, op1, op2, simdBaseJitType, simdSize, isShuffleNative);
26069+
}
2602026070

2602126071
var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType);
2602226072
assert(varTypeIsArithmetic(simdBaseType));
@@ -26051,7 +26101,7 @@ GenTree* Compiler::gtNewSimdShuffleNode(
2605126101
if (isShuffleNative && gotInvalidIndex)
2605226102
{
2605326103
// Call variable implementation.
26054-
return gtNewSimdShuffleNodeVariable(type, op1, op2, simdBaseJitType, simdSize, isShuffleNative);
26104+
return gtNewSimdShuffleVariableNode(type, op1, op2, simdBaseJitType, simdSize, isShuffleNative);
2605526105
}
2605626106
if (hasIdentityShuffle)
2605726107
{
@@ -26139,9 +26189,12 @@ GenTree* Compiler::gtNewSimdShuffleNode(
2613926189
}
2614026190
}
2614126191

26142-
// Check if the value differs in this lane vs any other lane
26192+
// Check if the value differs in this lane vs any other lane (note: lane is 128 bits, or 16 bytes)
2614326193
if (index * elementSize >= 16)
2614426194
{
26195+
// Check if the element, masked to the lane, is the same as the element in the same position of earlier lanes.
26196+
// If it differs, differsByLane will be set to true. We just compare to the first lane, as we already compared
26197+
// it to any other in between lanes.
2614526198
differsByLane |= ((vecCns.u8[index * elementSize] ^ vecCns.u8[(index * elementSize) & 15]) & 15) != 0;
2614626199
}
2614726200
}
@@ -26151,12 +26204,12 @@ GenTree* Compiler::gtNewSimdShuffleNode(
2615126204
assert(compIsaSupportedDebugOnly(InstructionSet_AVX2));
2615226205
bool isV512Supported = false;
2615326206
if ((varTypeIsByte(simdBaseType) &&
26154-
!compIsEvexOpportunisticallySupported(isV512Supported, InstructionSet_AVX512VBMI_VL)) ||
26207+
(!compIsEvexOpportunisticallySupported(isV512Supported, InstructionSet_AVX512VBMI_VL))) ||
2615526208
(varTypeIsShort(simdBaseType) &&
26156-
!compIsEvexOpportunisticallySupported(isV512Supported, InstructionSet_AVX512BW_VL)) ||
26209+
(!compIsEvexOpportunisticallySupported(isV512Supported, InstructionSet_AVX512BW_VL))) ||
2615726210
// This condition is the condition for when we'd have to emit something slower than what we can do with
2615826211
// NI_AVX2_Shuffle directly:
26159-
(!crossLane && (needsZero || elementSize < 4 || (elementSize == 4 && differsByLane))))
26212+
((!crossLane) && (needsZero || (elementSize < 4) || ((elementSize == 4) && differsByLane))))
2616026213
{
2616126214
// we want to treat our type like byte here
2616226215
simdBaseJitType = varTypeIsUnsigned(simdBaseType) ? CORINFO_TYPE_UBYTE : CORINFO_TYPE_BYTE;
@@ -26170,7 +26223,7 @@ GenTree* Compiler::gtNewSimdShuffleNode(
2617026223
for (size_t index = 0; index < simdSize; index++)
2617126224
{
2617226225
// get pointer to our leftWants/rightWants
26173-
uint8_t* wants = (index < 16) ? &leftWants : &rightWants;
26226+
uint8_t* wants = (index < 16) ? (&leftWants) : (&rightWants);
2617426227

2617526228
// update our wants based on which values we use
2617626229
value = vecCns.u8[index];
@@ -26185,7 +26238,7 @@ GenTree* Compiler::gtNewSimdShuffleNode(
2618526238

2618626239
// update our conditional select mask for if we need 2 shuffles
2618726240
value ^= static_cast<uint64_t>(index & 0x10);
26188-
selCns.u8[index] = (value < 32 && value >= 16) ? 0xFF : 0;
26241+
selCns.u8[index] = ((value < 32) && (value >= 16)) ? 0xFF : 0;
2618926242

2619026243
// normalise our shuffle mask, and check if it's default
2619126244
if (vecCns.u8[index] < 32)
@@ -26200,7 +26253,7 @@ GenTree* Compiler::gtNewSimdShuffleNode(
2620026253

2620126254
// we might be able to get away with only 1 shuffle, this is the case if neither leftWants nor
2620226255
// rightWants are 3 (indicating only 0/1 side used)
26203-
if (leftWants != 3 && rightWants != 3)
26256+
if ((leftWants != 3) && (rightWants != 3))
2620426257
{
2620526258
// set result to its initial value
2620626259
retNode = op1;
@@ -26277,7 +26330,7 @@ GenTree* Compiler::gtNewSimdShuffleNode(
2627726330
if (elementSize == 4)
2627826331
{
2627926332
// try to use vpshufd/vshufps instead of vpermd/vpermps.
26280-
if (!crossLane && !differsByLane)
26333+
if ((!crossLane) && (!differsByLane))
2628126334
{
2628226335
assert(!needsZero);
2628326336
unsigned immediate = (unsigned)0;
@@ -26378,7 +26431,7 @@ GenTree* Compiler::gtNewSimdShuffleNode(
2637826431
if (!crossLane)
2637926432
{
2638026433
// if element size is 64-bit, try to use vshufpd instead of vpshufb.
26381-
if (elementSize == 8 && !needsZero)
26434+
if ((elementSize == 8) && (!needsZero))
2638226435
{
2638326436
unsigned immediate = (unsigned)0;
2638426437
for (size_t i = 0; i < elementCount; i++)
@@ -26394,7 +26447,7 @@ GenTree* Compiler::gtNewSimdShuffleNode(
2639426447

2639526448
// if the element size is 32-bit, try to use vpshufd/vshufps instead of vpshufb,
2639626449
// if the indices (when masked to within the lane) are the same for every lane.
26397-
if (elementSize == 4 && !needsZero && !differsByLane)
26450+
if ((elementSize == 4) && (!needsZero) && (!differsByLane))
2639826451
{
2639926452
unsigned immediate = (unsigned)0;
2640026453
for (size_t i = 0; i < 4; i++)
@@ -26530,7 +26583,7 @@ GenTree* Compiler::gtNewSimdShuffleNode(
2653026583

2653126584
if (needsZero)
2653226585
{
26533-
assert((simdSize == 32) || !compIsaSupportedDebugOnly(InstructionSet_SSSE3));
26586+
assert((simdSize == 32) || (!compIsaSupportedDebugOnly(InstructionSet_SSSE3)));
2653426587

2653526588
op2 = gtNewVconNode(type);
2653626589
op2->AsVecCon()->gtSimdVal = mskCns;

0 commit comments

Comments
 (0)