From b042369b1a50039c5f2bfad077c977bb66480e19 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Mon, 14 Jul 2025 12:38:53 -0700 Subject: [PATCH 1/3] allow more containment opts in tier0 --- src/coreclr/jit/lowerxarch.cpp | 155 ++++++++++++++------------------- 1 file changed, 63 insertions(+), 92 deletions(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 784115399ced4c..84d9b629da1b2e 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -9255,15 +9255,15 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre case NI_AVX_LoadAlignedVector256: case NI_AVX512_LoadAlignedVector512: { - // In minOpts, we need to ensure that an unaligned address will fault when an explicit LoadAligned is used. - // Non-VEX encoded instructions will fault if an unaligned SIMD16 load is contained but will not for scalar - // loads, and VEX-encoded instructions will not fault for unaligned loads in any case. + // For debug code, we need to ensure that an unaligned address will fault when an explicit LoadAligned is + // used. Non-VEX encoded instructions will fault if an unaligned SIMD16 load is contained but will not for + // scalar loads, and VEX-encoded instructions will not fault for unaligned loads in any case. // // When optimizations are enabled, we want to contain any aligned load that is large enough for the parent's // requirement. - return (supportsSIMDLoad && - ((!comp->canUseVexEncoding() && expectedSize == genTypeSize(TYP_SIMD16)) || !comp->opts.MinOpts())); + return (supportsSIMDLoad && ((!comp->canUseVexEncoding() && expectedSize == genTypeSize(TYP_SIMD16)) || + comp->opts.Tier0OptimizationEnabled())); } case NI_X86Base_LoadScalarVector128: @@ -9279,7 +9279,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre case NI_AVX2_BroadcastScalarToVector256: case NI_AVX512_BroadcastScalarToVector512: { - if (comp->opts.MinOpts() || !comp->canUseEmbeddedBroadcast()) + if (!comp->opts.Tier0OptimizationEnabled() || !comp->canUseEmbeddedBroadcast()) { return false; } @@ -9369,113 +9369,84 @@ void Lowering::TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, return; } - // We use the child node's size for the broadcast node, because the parent may consume more than its own size. - // The containment check has already validated that the child is sufficiently large. - // // We use the parent node's base type, because we must ensure that the constant repeats correctly for that size, // regardless of how the constant vector was created. - var_types simdType = childNode->TypeGet(); - var_types simdBaseType = parentNode->GetSimdBaseType(); - CorInfoType simdBaseJitType = parentNode->GetSimdBaseJitType(); - bool isCreatedFromScalar = true; + var_types simdBaseType = parentNode->GetSimdBaseType(); + CorInfoType simdBaseJitType = parentNode->GetSimdBaseJitType(); + + if (varTypeIsSmall(simdBaseType) || !childNode->IsBroadcast(simdBaseType)) + { + MakeSrcContained(parentNode, childNode); + return; + } + + // We use the child node's size for the broadcast node, because the parent may consume more than its own size. + // The containment check has already validated that the child is sufficiently large. + + var_types simdType = childNode->TypeGet(); + NamedIntrinsic broadcastName = NI_AVX2_BroadcastScalarToVector128; + GenTree* constScalar = nullptr; - if (varTypeIsSmall(simdBaseType)) + if (simdType == TYP_SIMD32) + { + broadcastName = NI_AVX2_BroadcastScalarToVector256; + } + else if (simdType == TYP_SIMD64) { - isCreatedFromScalar = false; + broadcastName = NI_AVX512_BroadcastScalarToVector512; } else { - isCreatedFromScalar = childNode->IsBroadcast(simdBaseType); + assert(simdType == TYP_SIMD16); } - if (isCreatedFromScalar) + switch (simdBaseType) { - NamedIntrinsic broadcastName = NI_AVX2_BroadcastScalarToVector128; - if (simdType == TYP_SIMD32) + case TYP_FLOAT: { - broadcastName = NI_AVX2_BroadcastScalarToVector256; + float scalar = childNode->gtSimdVal.f32[0]; + constScalar = comp->gtNewDconNodeF(scalar); + break; } - else if (simdType == TYP_SIMD64) + case TYP_DOUBLE: { - broadcastName = NI_AVX512_BroadcastScalarToVector512; + double scalar = childNode->gtSimdVal.f64[0]; + constScalar = comp->gtNewDconNodeD(scalar); + break; } - else + case TYP_INT: + case TYP_UINT: { - assert(simdType == TYP_SIMD16); + int32_t scalar = childNode->gtSimdVal.i32[0]; + constScalar = comp->gtNewIconNode(scalar); + break; } - - GenTree* constScalar = nullptr; - switch (simdBaseType) + case TYP_LONG: + case TYP_ULONG: { - case TYP_FLOAT: - { - float scalar = childNode->gtSimdVal.f32[0]; - constScalar = comp->gtNewDconNodeF(scalar); - break; - } - case TYP_DOUBLE: - { - double scalar = childNode->gtSimdVal.f64[0]; - constScalar = comp->gtNewDconNodeD(scalar); - break; - } - case TYP_INT: - { - int32_t scalar = childNode->gtSimdVal.i32[0]; - constScalar = comp->gtNewIconNode(scalar, simdBaseType); - break; - } - case TYP_UINT: - { - uint32_t scalar = childNode->gtSimdVal.u32[0]; - constScalar = comp->gtNewIconNode(scalar, TYP_INT); - break; - } - case TYP_LONG: - case TYP_ULONG: - { - int64_t scalar = childNode->gtSimdVal.i64[0]; - constScalar = comp->gtNewLconNode(scalar); - break; - } - default: - unreached(); + int64_t scalar = childNode->gtSimdVal.i64[0]; + constScalar = comp->gtNewLconNode(scalar); + break; } + default: + unreached(); + } - GenTreeHWIntrinsic* createScalar = - comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, constScalar, NI_Vector128_CreateScalarUnsafe, simdBaseJitType, - 16); - GenTreeHWIntrinsic* broadcastNode = comp->gtNewSimdHWIntrinsicNode(simdType, createScalar, broadcastName, - simdBaseJitType, genTypeSize(simdType)); - BlockRange().InsertBefore(childNode, broadcastNode); - BlockRange().InsertBefore(broadcastNode, createScalar); - BlockRange().InsertBefore(createScalar, constScalar); - LIR::Use use; - if (BlockRange().TryGetUse(childNode, &use)) - { - use.ReplaceWith(broadcastNode); - } - else - { - broadcastNode->SetUnusedValue(); - } + GenTreeHWIntrinsic* broadcastNode = + comp->gtNewSimdHWIntrinsicNode(simdType, constScalar, broadcastName, simdBaseJitType, genTypeSize(simdType)); - BlockRange().Remove(childNode); - LowerNode(createScalar); - LowerNode(broadcastNode); - if (varTypeIsFloating(simdBaseType)) - { - MakeSrcContained(broadcastNode, createScalar); - } - else if (constScalar->TypeIs(TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG)) - { - MakeSrcContained(broadcastNode, constScalar); - } - MakeSrcContained(parentNode, broadcastNode); - return; - } - MakeSrcContained(parentNode, childNode); + BlockRange().InsertBefore(childNode, constScalar, broadcastNode); + BlockRange().Remove(childNode); + + GenTree** use = nullptr; + bool useFound = parentNode->TryGetUse(childNode, &use); + assert(useFound); + + parentNode->ReplaceOperand(use, broadcastNode); + + MakeSrcContained(broadcastNode, constScalar); + MakeSrcContained(parentNode, broadcastNode); } //------------------------------------------------------------------------ From f0bef79f886c0a8885bd7e2ab13896f3d3de3738 Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Tue, 15 Jul 2025 13:04:03 -0700 Subject: [PATCH 2/3] disable aligned load containment when opts disabled --- src/coreclr/jit/lowerxarch.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 84d9b629da1b2e..c9d7a4eff5bd87 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -9262,8 +9262,8 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre // When optimizations are enabled, we want to contain any aligned load that is large enough for the parent's // requirement. - return (supportsSIMDLoad && ((!comp->canUseVexEncoding() && expectedSize == genTypeSize(TYP_SIMD16)) || - comp->opts.Tier0OptimizationEnabled())); + return (supportsSIMDLoad && (comp->opts.OptimizationEnabled() || + (!comp->canUseVexEncoding() && expectedSize == genTypeSize(TYP_SIMD16)))); } case NI_X86Base_LoadScalarVector128: @@ -9436,7 +9436,7 @@ void Lowering::TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, GenTreeHWIntrinsic* broadcastNode = comp->gtNewSimdHWIntrinsicNode(simdType, constScalar, broadcastName, simdBaseJitType, genTypeSize(simdType)); - BlockRange().InsertBefore(childNode, constScalar, broadcastNode); + BlockRange().InsertBefore(parentNode, constScalar, broadcastNode); BlockRange().Remove(childNode); GenTree** use = nullptr; From e16f00ebbdec67656ccd1eeaf00ee3cea6e1a64d Mon Sep 17 00:00:00 2001 From: Clinton Ingram Date: Wed, 16 Jul 2025 00:12:14 -0700 Subject: [PATCH 3/3] revert TryFoldCnsVecForEmbeddedBroadcast changes --- src/coreclr/jit/lowerxarch.cpp | 143 ++++++++++++++++++++------------- 1 file changed, 86 insertions(+), 57 deletions(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index c9d7a4eff5bd87..053373e7622329 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -9369,84 +9369,113 @@ void Lowering::TryFoldCnsVecForEmbeddedBroadcast(GenTreeHWIntrinsic* parentNode, return; } - // We use the parent node's base type, because we must ensure that the constant repeats correctly for that size, - // regardless of how the constant vector was created. - - var_types simdBaseType = parentNode->GetSimdBaseType(); - CorInfoType simdBaseJitType = parentNode->GetSimdBaseJitType(); - - if (varTypeIsSmall(simdBaseType) || !childNode->IsBroadcast(simdBaseType)) - { - MakeSrcContained(parentNode, childNode); - return; - } - // We use the child node's size for the broadcast node, because the parent may consume more than its own size. // The containment check has already validated that the child is sufficiently large. + // + // We use the parent node's base type, because we must ensure that the constant repeats correctly for that size, + // regardless of how the constant vector was created. - var_types simdType = childNode->TypeGet(); - NamedIntrinsic broadcastName = NI_AVX2_BroadcastScalarToVector128; - GenTree* constScalar = nullptr; + var_types simdType = childNode->TypeGet(); + var_types simdBaseType = parentNode->GetSimdBaseType(); + CorInfoType simdBaseJitType = parentNode->GetSimdBaseJitType(); + bool isCreatedFromScalar = true; - if (simdType == TYP_SIMD32) - { - broadcastName = NI_AVX2_BroadcastScalarToVector256; - } - else if (simdType == TYP_SIMD64) + if (varTypeIsSmall(simdBaseType)) { - broadcastName = NI_AVX512_BroadcastScalarToVector512; + isCreatedFromScalar = false; } else { - assert(simdType == TYP_SIMD16); + isCreatedFromScalar = childNode->IsBroadcast(simdBaseType); } - switch (simdBaseType) + if (isCreatedFromScalar) { - case TYP_FLOAT: + NamedIntrinsic broadcastName = NI_AVX2_BroadcastScalarToVector128; + if (simdType == TYP_SIMD32) { - float scalar = childNode->gtSimdVal.f32[0]; - constScalar = comp->gtNewDconNodeF(scalar); - break; + broadcastName = NI_AVX2_BroadcastScalarToVector256; } - case TYP_DOUBLE: + else if (simdType == TYP_SIMD64) { - double scalar = childNode->gtSimdVal.f64[0]; - constScalar = comp->gtNewDconNodeD(scalar); - break; + broadcastName = NI_AVX512_BroadcastScalarToVector512; } - case TYP_INT: - case TYP_UINT: + else { - int32_t scalar = childNode->gtSimdVal.i32[0]; - constScalar = comp->gtNewIconNode(scalar); - break; + assert(simdType == TYP_SIMD16); } - case TYP_LONG: - case TYP_ULONG: + + GenTree* constScalar = nullptr; + switch (simdBaseType) { - int64_t scalar = childNode->gtSimdVal.i64[0]; - constScalar = comp->gtNewLconNode(scalar); - break; + case TYP_FLOAT: + { + float scalar = childNode->gtSimdVal.f32[0]; + constScalar = comp->gtNewDconNodeF(scalar); + break; + } + case TYP_DOUBLE: + { + double scalar = childNode->gtSimdVal.f64[0]; + constScalar = comp->gtNewDconNodeD(scalar); + break; + } + case TYP_INT: + { + int32_t scalar = childNode->gtSimdVal.i32[0]; + constScalar = comp->gtNewIconNode(scalar, simdBaseType); + break; + } + case TYP_UINT: + { + uint32_t scalar = childNode->gtSimdVal.u32[0]; + constScalar = comp->gtNewIconNode(scalar, TYP_INT); + break; + } + case TYP_LONG: + case TYP_ULONG: + { + int64_t scalar = childNode->gtSimdVal.i64[0]; + constScalar = comp->gtNewLconNode(scalar); + break; + } + default: + unreached(); } - default: - unreached(); - } - - GenTreeHWIntrinsic* broadcastNode = - comp->gtNewSimdHWIntrinsicNode(simdType, constScalar, broadcastName, simdBaseJitType, genTypeSize(simdType)); - - BlockRange().InsertBefore(parentNode, constScalar, broadcastNode); - BlockRange().Remove(childNode); - - GenTree** use = nullptr; - bool useFound = parentNode->TryGetUse(childNode, &use); - assert(useFound); - parentNode->ReplaceOperand(use, broadcastNode); + GenTreeHWIntrinsic* createScalar = + comp->gtNewSimdHWIntrinsicNode(TYP_SIMD16, constScalar, NI_Vector128_CreateScalarUnsafe, simdBaseJitType, + 16); + GenTreeHWIntrinsic* broadcastNode = comp->gtNewSimdHWIntrinsicNode(simdType, createScalar, broadcastName, + simdBaseJitType, genTypeSize(simdType)); + BlockRange().InsertBefore(childNode, broadcastNode); + BlockRange().InsertBefore(broadcastNode, createScalar); + BlockRange().InsertBefore(createScalar, constScalar); + LIR::Use use; + if (BlockRange().TryGetUse(childNode, &use)) + { + use.ReplaceWith(broadcastNode); + } + else + { + broadcastNode->SetUnusedValue(); + } - MakeSrcContained(broadcastNode, constScalar); - MakeSrcContained(parentNode, broadcastNode); + BlockRange().Remove(childNode); + LowerNode(createScalar); + LowerNode(broadcastNode); + if (varTypeIsFloating(simdBaseType)) + { + MakeSrcContained(broadcastNode, createScalar); + } + else if (constScalar->TypeIs(TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG)) + { + MakeSrcContained(broadcastNode, constScalar); + } + MakeSrcContained(parentNode, broadcastNode); + return; + } + MakeSrcContained(parentNode, childNode); } //------------------------------------------------------------------------