@@ -18393,10 +18393,11 @@ unsigned GenTreeVecCon::ElementCount(unsigned simdSize, var_types simdBaseType)
18393
18393
bool Compiler::IsValidForShuffle(GenTree* indices,
18394
18394
unsigned simdSize,
18395
18395
var_types simdBaseType,
18396
- bool* canBecomeValid) const
18396
+ bool* canBecomeValid,
18397
+ bool isShuffleNative) const
18397
18398
{
18398
18399
#if defined(TARGET_XARCH)
18399
- if (canBecomeValid)
18400
+ if (canBecomeValid != nullptr )
18400
18401
{
18401
18402
*canBecomeValid = false;
18402
18403
}
@@ -18414,7 +18415,7 @@ bool Compiler::IsValidForShuffle(GenTree* indices,
18414
18415
}
18415
18416
else if (simdSize == 64)
18416
18417
{
18417
- if (varTypeIsByte(simdBaseType) && !compOpportunisticallyDependsOn(InstructionSet_AVX512VBMI))
18418
+ if (varTypeIsByte(simdBaseType) && ( !compOpportunisticallyDependsOn(InstructionSet_AVX512VBMI) ))
18418
18419
{
18419
18420
// TYP_BYTE, TYP_UBYTE need AVX512VBMI.
18420
18421
return false;
@@ -18424,13 +18425,27 @@ bool Compiler::IsValidForShuffle(GenTree* indices,
18424
18425
{
18425
18426
assert(simdSize == 16);
18426
18427
18427
- if (varTypeIsSmall(simdBaseType) && !compOpportunisticallyDependsOn(InstructionSet_SSSE3))
18428
+ if (varTypeIsSmall(simdBaseType) && ( !compOpportunisticallyDependsOn(InstructionSet_SSSE3) ))
18428
18429
{
18429
18430
// TYP_BYTE, TYP_UBYTE, TYP_SHORT, and TYP_USHORT need SSSE3 to be able to shuffle any operation
18430
18431
return false;
18431
18432
}
18432
18433
18433
- if (!indices->IsCnsVec() && !compOpportunisticallyDependsOn(InstructionSet_SSSE3))
18434
+ bool isVariableShuffle = !indices->IsCnsVec();
18435
+ if ((!isVariableShuffle) && isShuffleNative)
18436
+ {
18437
+ // ShuffleNative with constant indices with 1 or more out of range indices is emitted as variable indices.
18438
+ for (size_t index = 0; index < elementCount; index++)
18439
+ {
18440
+ uint64_t value = op2->GetIntegralVectorConstElement(index, simdBaseType);
18441
+ if (value >= elementCount)
18442
+ {
18443
+ isVariableShuffle = true;
18444
+ break;
18445
+ }
18446
+ }
18447
+ }
18448
+ if (isVariableShuffle && (!compOpportunisticallyDependsOn(InstructionSet_SSSE3)))
18434
18449
{
18435
18450
// the variable implementation for Vector128 Shuffle always needs SSSE3
18436
18451
// however, this can become valid later if it becomes constant
@@ -18443,7 +18458,7 @@ bool Compiler::IsValidForShuffle(GenTree* indices,
18443
18458
}
18444
18459
#endif // TARGET_XARCH
18445
18460
18446
- if (canBecomeValid)
18461
+ if (canBecomeValid != nullptr )
18447
18462
{
18448
18463
*canBecomeValid = true;
18449
18464
}
@@ -25391,7 +25406,23 @@ GenTree* Compiler::gtNewSimdRoundNode(var_types type, GenTree* op1, CorInfoType
25391
25406
return gtNewSimdHWIntrinsicNode(type, op1, intrinsic, simdBaseJitType, simdSize);
25392
25407
}
25393
25408
25394
- GenTree* Compiler::gtNewSimdShuffleNodeVariable(
25409
+ //------------------------------------------------------------------------
25410
+ // gtNewSimdShuffleVariableNode: Creates a new simd shuffle node for variable indices, or for a case that is not
25411
+ // handled in gtNewSimdShuffleNode (ShuffleNative with out-of-bounds constant indices). This is a helper for
25412
+ // gtNewSimdShuffleNode; other callers should go through gtNewSimdShuffleNode instead of calling it directly.
25413
+ //
25414
+ // Arguments:
25415
+ // type -- The type of the node
25416
+ // op1 -- The values to shuffle
25417
+ // op2 -- The indices to pick from (variable)
25418
+ // simdBaseJitType -- The base jit type of the node
25419
+ // simdSize -- The simd size of the node
25420
+ // isShuffleNative -- Whether we're making a ShuffleNative node vs a Shuffle one
25421
+ //
25422
+ // Return Value:
25423
+ // The shuffle node
25424
+ //
25425
+ GenTree* Compiler::gtNewSimdShuffleVariableNode(
25395
25426
var_types type, GenTree* op1, GenTree* op2, CorInfoType simdBaseJitType, unsigned simdSize, bool isShuffleNative)
25396
25427
{
25397
25428
assert(IsBaselineSimdIsaSupportedDebugOnly());
@@ -25405,7 +25436,7 @@ GenTree* Compiler::gtNewSimdShuffleNodeVariable(
25405
25436
var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType);
25406
25437
assert(op2 != nullptr);
25407
25438
assert(op2->TypeIs(type));
25408
- assert(!op2->IsCnsVec() || isShuffleNative);
25439
+ assert(( !op2->IsCnsVec() ) || isShuffleNative);
25409
25440
25410
25441
GenTree* retNode = nullptr;
25411
25442
GenTree* cnsNode = nullptr;
@@ -25419,7 +25450,7 @@ GenTree* Compiler::gtNewSimdShuffleNodeVariable(
25419
25450
#if defined(TARGET_XARCH)
25420
25451
if (!isShuffleNative)
25421
25452
#elif defined(TARGET_ARM64)
25422
- if (!isShuffleNative && elementSize > 1)
25453
+ if (( !isShuffleNative) && ( elementSize > 1) )
25423
25454
#else
25424
25455
#error Unsupported platform
25425
25456
#endif // !TARGET_XARCH && !TARGET_ARM64
@@ -25474,7 +25505,7 @@ GenTree* Compiler::gtNewSimdShuffleNodeVariable(
25474
25505
retNode->SetReverseOp();
25475
25506
}
25476
25507
}
25477
- else if (elementSize == 1 && simdSize == 16)
25508
+ else if (( elementSize == 1) && ( simdSize == 16) )
25478
25509
{
25479
25510
assert(compIsaSupportedDebugOnly(InstructionSet_SSSE3));
25480
25511
@@ -25483,7 +25514,7 @@ GenTree* Compiler::gtNewSimdShuffleNodeVariable(
25483
25514
// high bit on index gives 0 already
25484
25515
canUseSignedComparisonHint = true;
25485
25516
}
25486
- else if (elementSize == 1 && simdSize == 32 &&
25517
+ else if (( elementSize == 1) && ( simdSize == 32) &&
25487
25518
compIsEvexOpportunisticallySupported(isV512Supported, InstructionSet_AVX512VBMI_VL))
25488
25519
{
25489
25520
NamedIntrinsic intrinsic = isV512Supported ? NI_AVX512VBMI_VL_PermuteVar32x8 : NI_AVX10v1_PermuteVar32x8;
@@ -25492,26 +25523,26 @@ GenTree* Compiler::gtNewSimdShuffleNodeVariable(
25492
25523
retNode = gtNewSimdHWIntrinsicNode(type, op2, op1, intrinsic, simdBaseJitType, simdSize);
25493
25524
retNode->SetReverseOp();
25494
25525
}
25495
- else if (elementSize == 2 && compIsEvexOpportunisticallySupported(isV512Supported, InstructionSet_AVX512BW_VL))
25526
+ else if (( elementSize == 2) && compIsEvexOpportunisticallySupported(isV512Supported, InstructionSet_AVX512BW_VL))
25496
25527
{
25497
- assert(simdSize == 16 || simdSize == 32);
25528
+ assert(( simdSize == 16) || ( simdSize == 32) );
25498
25529
NamedIntrinsic intrinsic;
25499
25530
if (isV512Supported)
25500
25531
{
25501
- intrinsic = simdSize == 16 ? NI_AVX512BW_VL_PermuteVar8x16 : NI_AVX512BW_VL_PermuteVar16x16;
25532
+ intrinsic = ( simdSize == 16) ? NI_AVX512BW_VL_PermuteVar8x16 : NI_AVX512BW_VL_PermuteVar16x16;
25502
25533
}
25503
25534
else
25504
25535
{
25505
- intrinsic = simdSize == 16 ? NI_AVX10v1_PermuteVar8x16 : NI_AVX10v1_PermuteVar16x16;
25536
+ intrinsic = ( simdSize == 16) ? NI_AVX10v1_PermuteVar8x16 : NI_AVX10v1_PermuteVar16x16;
25506
25537
}
25507
25538
25508
25539
// swap the operands to match the encoding requirements
25509
25540
retNode = gtNewSimdHWIntrinsicNode(type, op2, op1, intrinsic, simdBaseJitType, simdSize);
25510
25541
retNode->SetReverseOp();
25511
25542
}
25512
- else if (elementSize == 4 && (simdSize == 32 || compOpportunisticallyDependsOn(InstructionSet_AVX)))
25543
+ else if (( elementSize == 4) && (( simdSize == 32) || compOpportunisticallyDependsOn(InstructionSet_AVX)))
25513
25544
{
25514
- assert(simdSize == 16 || simdSize == 32);
25545
+ assert(( simdSize == 16) || ( simdSize == 32) );
25515
25546
25516
25547
if (simdSize == 32)
25517
25548
{
@@ -25528,7 +25559,7 @@ GenTree* Compiler::gtNewSimdShuffleNodeVariable(
25528
25559
retNode = gtNewSimdHWIntrinsicNode(type, op1, op2, NI_AVX_PermuteVar, CORINFO_TYPE_FLOAT, simdSize);
25529
25560
}
25530
25561
}
25531
- else if (elementSize == 8 && simdSize == 32 &&
25562
+ else if (( elementSize == 8) && ( simdSize == 32) &&
25532
25563
compIsEvexOpportunisticallySupported(isV512Supported, InstructionSet_AVX512F_VL))
25533
25564
{
25534
25565
NamedIntrinsic intrinsic = isV512Supported ? NI_AVX512F_VL_PermuteVar4x64 : NI_AVX10v1_PermuteVar4x64;
@@ -25537,7 +25568,7 @@ GenTree* Compiler::gtNewSimdShuffleNodeVariable(
25537
25568
retNode = gtNewSimdHWIntrinsicNode(type, op2, op1, intrinsic, simdBaseJitType, simdSize);
25538
25569
retNode->SetReverseOp();
25539
25570
}
25540
- else if (elementSize == 8 && simdSize == 16 &&
25571
+ else if (( elementSize == 8) && ( simdSize == 16) &&
25541
25572
compIsEvexOpportunisticallySupported(isV512Supported, InstructionSet_AVX512F_VL))
25542
25573
{
25543
25574
GenTree* op1Copy = fgMakeMultiUse(&op1); // just use op1 again for the other variable
@@ -25546,12 +25577,12 @@ GenTree* Compiler::gtNewSimdShuffleNodeVariable(
25546
25577
}
25547
25578
else
25548
25579
{
25549
- assert((elementSize == 1 && simdSize == 32) || elementSize == 2 || (elementSize == 4 && simdSize == 16) ||
25550
- elementSize == 8);
25580
+ assert((( elementSize == 1) && ( simdSize == 32)) || ( elementSize == 2) || (( elementSize == 4) && ( simdSize == 16) ) ||
25581
+ ( elementSize == 8) );
25551
25582
25552
- if (elementSize == 8 && (simdSize == 32 || compOpportunisticallyDependsOn(InstructionSet_AVX)))
25583
+ if (( elementSize == 8) && (( simdSize == 32) || compOpportunisticallyDependsOn(InstructionSet_AVX)))
25553
25584
{
25554
- assert(simdSize == 16 || simdSize == 32);
25585
+ assert(( simdSize == 16) || ( simdSize == 32) );
25555
25586
if (simdSize == 32)
25556
25587
{
25557
25588
assert(compIsaSupportedDebugOnly(InstructionSet_AVX2));
@@ -25715,7 +25746,7 @@ GenTree* Compiler::gtNewSimdShuffleNodeVariable(
25715
25746
else
25716
25747
{
25717
25748
// create required clones of op2
25718
- op2Dup1 = op2DupSafe != nullptr ? gtCloneExpr(op2DupSafe) : fgMakeMultiUse(&op2);
25749
+ op2Dup1 = ( op2DupSafe != nullptr) ? gtCloneExpr(op2DupSafe) : fgMakeMultiUse(&op2);
25719
25750
op2Dup2 = gtCloneExpr(op2Dup1);
25720
25751
}
25721
25752
@@ -25868,7 +25899,7 @@ GenTree* Compiler::gtNewSimdShuffleNodeVariable(
25868
25899
simdBaseJitType = CORINFO_TYPE_LONG;
25869
25900
}
25870
25901
}
25871
- if (simdSize == 16 && simdBaseJitType == CORINFO_TYPE_INT)
25902
+ if (( simdSize == 16) && ( simdBaseJitType == CORINFO_TYPE_INT) )
25872
25903
{
25873
25904
simdBaseJitType = CORINFO_TYPE_UINT;
25874
25905
}
@@ -25915,7 +25946,7 @@ GenTree* Compiler::gtNewSimdShuffleNodeVariable(
25915
25946
#if defined(TARGET_XARCH)
25916
25947
if (!isShuffleNative)
25917
25948
#elif defined(TARGET_ARM64)
25918
- if (!isShuffleNative && elementSize > 1)
25949
+ if (( !isShuffleNative) && ( elementSize > 1) )
25919
25950
#else
25920
25951
#error Unsupported platform
25921
25952
#endif // !TARGET_XARCH && !TARGET_ARM64
@@ -25945,9 +25976,9 @@ GenTree* Compiler::gtNewSimdShuffleNodeVariable(
25945
25976
#if defined(TARGET_XARCH)
25946
25977
// check if we have hardware accelerated unsigned comparison
25947
25978
bool hardwareAcceleratedUnsignedComparison =
25948
- simdSize == 64 ||
25949
- (elementSize < 4 && compIsEvexOpportunisticallySupported(isV512Supported, InstructionSet_AVX512BW_VL)) ||
25950
- (elementSize >= 4 && compIsEvexOpportunisticallySupported(isV512Supported, InstructionSet_AVX512F_VL));
25979
+ ( simdSize == 64) ||
25980
+ (( elementSize < 4) && compIsEvexOpportunisticallySupported(isV512Supported, InstructionSet_AVX512BW_VL)) ||
25981
+ (( elementSize >= 4) && compIsEvexOpportunisticallySupported(isV512Supported, InstructionSet_AVX512F_VL));
25951
25982
25952
25983
// if the hardware doesn't support direct unsigned comparison, we attempt to use signed comparison
25953
25984
if (!hardwareAcceleratedUnsignedComparison)
@@ -26003,6 +26034,20 @@ GenTree* Compiler::gtNewSimdShuffleNodeVariable(
26003
26034
return retNode;
26004
26035
}
26005
26036
26037
+ //------------------------------------------------------------------------
26038
+ // gtNewSimdShuffleNode: Creates a new simd shuffle node
26039
+ //
26040
+ // Arguments:
26041
+ // type -- The type of the node
26042
+ // op1 -- The values to shuffle
26043
+ // op2 -- The indices to pick from
26044
+ // simdBaseJitType -- The base jit type of the node
26045
+ // simdSize -- The simd size of the node
26046
+ // isShuffleNative -- Whether we're making a ShuffleNative node vs a Shuffle one
26047
+ //
26048
+ // Return Value:
26049
+ // The shuffle node
26050
+ //
26006
26051
GenTree* Compiler::gtNewSimdShuffleNode(
26007
26052
var_types type, GenTree* op1, GenTree* op2, CorInfoType simdBaseJitType, unsigned simdSize, bool isShuffleNative)
26008
26053
{
@@ -26016,7 +26061,12 @@ GenTree* Compiler::gtNewSimdShuffleNode(
26016
26061
26017
26062
assert(op2 != nullptr);
26018
26063
assert(op2->TypeIs(type));
26019
- assert(op2->IsCnsVec());
26064
+
26065
+ // If op2 is not constant, call into the gtNewSimdShuffleVariableNode routine
26066
+ if (!op2->IsCnsVec())
26067
+ {
26068
+ return gtNewSimdShuffleVariableNode(type, op1, op2, simdBaseJitType, simdSize, isShuffleNative);
26069
+ }
26020
26070
26021
26071
var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType);
26022
26072
assert(varTypeIsArithmetic(simdBaseType));
@@ -26051,7 +26101,7 @@ GenTree* Compiler::gtNewSimdShuffleNode(
26051
26101
if (isShuffleNative && gotInvalidIndex)
26052
26102
{
26053
26103
// Call variable implementation.
26054
- return gtNewSimdShuffleNodeVariable (type, op1, op2, simdBaseJitType, simdSize, isShuffleNative);
26104
+ return gtNewSimdShuffleVariableNode (type, op1, op2, simdBaseJitType, simdSize, isShuffleNative);
26055
26105
}
26056
26106
if (hasIdentityShuffle)
26057
26107
{
@@ -26139,9 +26189,12 @@ GenTree* Compiler::gtNewSimdShuffleNode(
26139
26189
}
26140
26190
}
26141
26191
26142
- // Check if the value differs in this lane vs any other lane
26192
+ // Check if the value differs in this lane vs any other lane (note: lane is 128 bits, or 16 bytes)
26143
26193
if (index * elementSize >= 16)
26144
26194
{
26195
+ // Check if the element, masked to the lane, is the same as the element in the same position of earlier lanes.
26196
+ // If it differs, differsByLane will be set to true. We only compare against the first lane, because any lanes
26197
+ // in between have already been compared against the first lane themselves.
26145
26198
differsByLane |= ((vecCns.u8[index * elementSize] ^ vecCns.u8[(index * elementSize) & 15]) & 15) != 0;
26146
26199
}
26147
26200
}
@@ -26151,12 +26204,12 @@ GenTree* Compiler::gtNewSimdShuffleNode(
26151
26204
assert(compIsaSupportedDebugOnly(InstructionSet_AVX2));
26152
26205
bool isV512Supported = false;
26153
26206
if ((varTypeIsByte(simdBaseType) &&
26154
- !compIsEvexOpportunisticallySupported(isV512Supported, InstructionSet_AVX512VBMI_VL)) ||
26207
+ ( !compIsEvexOpportunisticallySupported(isV512Supported, InstructionSet_AVX512VBMI_VL) )) ||
26155
26208
(varTypeIsShort(simdBaseType) &&
26156
- !compIsEvexOpportunisticallySupported(isV512Supported, InstructionSet_AVX512BW_VL)) ||
26209
+ ( !compIsEvexOpportunisticallySupported(isV512Supported, InstructionSet_AVX512BW_VL) )) ||
26157
26210
// This condition is the condition for when we'd have to emit something slower than what we can do with
26158
26211
// NI_AVX2_Shuffle directly:
26159
- (!crossLane && (needsZero || elementSize < 4 || (elementSize == 4 && differsByLane))))
26212
+ (( !crossLane) && (needsZero || ( elementSize < 4) || (( elementSize == 4) && differsByLane))))
26160
26213
{
26161
26214
// we want to treat our type like byte here
26162
26215
simdBaseJitType = varTypeIsUnsigned(simdBaseType) ? CORINFO_TYPE_UBYTE : CORINFO_TYPE_BYTE;
@@ -26170,7 +26223,7 @@ GenTree* Compiler::gtNewSimdShuffleNode(
26170
26223
for (size_t index = 0; index < simdSize; index++)
26171
26224
{
26172
26225
// get pointer to our leftWants/rightWants
26173
- uint8_t* wants = (index < 16) ? &leftWants : &rightWants;
26226
+ uint8_t* wants = (index < 16) ? ( &leftWants) : ( &rightWants) ;
26174
26227
26175
26228
// update our wants based on which values we use
26176
26229
value = vecCns.u8[index];
@@ -26185,7 +26238,7 @@ GenTree* Compiler::gtNewSimdShuffleNode(
26185
26238
26186
26239
// update our conditional select mask for if we need 2 shuffles
26187
26240
value ^= static_cast<uint64_t>(index & 0x10);
26188
- selCns.u8[index] = (value < 32 && value >= 16) ? 0xFF : 0;
26241
+ selCns.u8[index] = (( value < 32) && ( value >= 16) ) ? 0xFF : 0;
26189
26242
26190
26243
// normalise our shuffle mask, and check if it's default
26191
26244
if (vecCns.u8[index] < 32)
@@ -26200,7 +26253,7 @@ GenTree* Compiler::gtNewSimdShuffleNode(
26200
26253
26201
26254
// we might be able to get away with only 1 shuffle, this is the case if neither leftWants nor
26202
26255
// rightWants are 3 (indicating only 0/1 side used)
26203
- if (leftWants != 3 && rightWants != 3)
26256
+ if (( leftWants != 3) && ( rightWants != 3) )
26204
26257
{
26205
26258
// set result to its initial value
26206
26259
retNode = op1;
@@ -26277,7 +26330,7 @@ GenTree* Compiler::gtNewSimdShuffleNode(
26277
26330
if (elementSize == 4)
26278
26331
{
26279
26332
// try to use vpshufd/vshufps instead of vpermd/vpermps.
26280
- if (!crossLane && !differsByLane)
26333
+ if (( !crossLane) && ( !differsByLane) )
26281
26334
{
26282
26335
assert(!needsZero);
26283
26336
unsigned immediate = (unsigned)0;
@@ -26378,7 +26431,7 @@ GenTree* Compiler::gtNewSimdShuffleNode(
26378
26431
if (!crossLane)
26379
26432
{
26380
26433
// if element size is 64-bit, try to use vshufpd instead of vpshufb.
26381
- if (elementSize == 8 && !needsZero)
26434
+ if (( elementSize == 8) && ( !needsZero) )
26382
26435
{
26383
26436
unsigned immediate = (unsigned)0;
26384
26437
for (size_t i = 0; i < elementCount; i++)
@@ -26394,7 +26447,7 @@ GenTree* Compiler::gtNewSimdShuffleNode(
26394
26447
26395
26448
// if the element size is 32-bit, try to use vpshufd/vshufps instead of vpshufb,
26396
26449
// if the indices (when masked to within the lane) are the same for every lane.
26397
- if (elementSize == 4 && !needsZero && !differsByLane)
26450
+ if (( elementSize == 4) && ( !needsZero) && ( !differsByLane) )
26398
26451
{
26399
26452
unsigned immediate = (unsigned)0;
26400
26453
for (size_t i = 0; i < 4; i++)
@@ -26530,7 +26583,7 @@ GenTree* Compiler::gtNewSimdShuffleNode(
26530
26583
26531
26584
if (needsZero)
26532
26585
{
26533
- assert((simdSize == 32) || !compIsaSupportedDebugOnly(InstructionSet_SSSE3));
26586
+ assert((simdSize == 32) || ( !compIsaSupportedDebugOnly(InstructionSet_SSSE3) ));
26534
26587
26535
26588
op2 = gtNewVconNode(type);
26536
26589
op2->AsVecCon()->gtSimdVal = mskCns;
0 commit comments