From ce655e3f6724a4bf0494fd7b77e9ff2fea478aa2 Mon Sep 17 00:00:00 2001 From: Egor Bogatov Date: Thu, 5 Oct 2023 21:16:14 +0200 Subject: [PATCH] Merged stores: Fix alignment-related issues and enable SIMD where possible (#92939) Co-authored-by: SingleAccretion <62474226+SingleAccretion@users.noreply.github.com> --- src/coreclr/jit/gentree.h | 1 + src/coreclr/jit/lower.cpp | 197 +++++++++++++++++++++++++++++++++----- 2 files changed, 175 insertions(+), 23 deletions(-) diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index 635e97e7072a22..b532c09591859f 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -490,6 +490,7 @@ enum GenTreeFlags : unsigned int GTF_IND_INVARIANT = 0x01000000, // GT_IND -- the target is invariant (a prejit indirection) GTF_IND_NONNULL = 0x00400000, // GT_IND -- the indirection never returns null (zero) GTF_IND_INITCLASS = 0x00200000, // OperIsIndir() -- the indirection requires preceding static cctor + GTF_IND_ALLOW_NON_ATOMIC = 0x00100000, // GT_IND -- this memory access does not need to be atomic GTF_IND_COPYABLE_FLAGS = GTF_IND_VOLATILE | GTF_IND_NONFAULTING | GTF_IND_UNALIGNED | GTF_IND_INITCLASS, GTF_IND_FLAGS = GTF_IND_COPYABLE_FLAGS | GTF_IND_NONNULL | GTF_IND_TGT_NOT_HEAP | GTF_IND_TGT_HEAP | GTF_IND_INVARIANT, diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 07f1f95ff4ebd5..0fb173425120c4 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -7833,26 +7833,34 @@ static bool GetStoreCoalescingData(Compiler* comp, GenTreeStoreInd* ind, StoreCo } // Data has to be INT_CNS, can be also VEC_CNS in future. - if (!ind->Data()->IsCnsIntOrI()) + if (!ind->Data()->IsCnsIntOrI() && !ind->Data()->IsVectorConst()) { return false; } + auto isNodeInvariant = [](Compiler* comp, GenTree* node, bool allowNull) { + if (node == nullptr) + { + return allowNull; + } + // We can allow bigger trees here, but it's not clear if it's worth it. + return node->OperIs(GT_LCL_VAR) && !comp->lvaVarAddrExposed(node->AsLclVar()->GetLclNum()); + }; + data->targetType = ind->TypeGet(); data->value = ind->Data(); if (ind->Addr()->OperIs(GT_LEA)) { GenTree* base = ind->Addr()->AsAddrMode()->Base(); GenTree* index = ind->Addr()->AsAddrMode()->Index(); - if ((base == nullptr) || !base->OperIs(GT_LCL_VAR) || comp->lvaVarAddrExposed(base->AsLclVar()->GetLclNum())) + if (!isNodeInvariant(comp, base, false)) { // Base must be a local. It's possible for it to be nullptr when index is not null, // but let's ignore such cases. return false; } - if ((index != nullptr) && - (!index->OperIs(GT_LCL_VAR) || comp->lvaVarAddrExposed(index->AsLclVar()->GetLclNum()))) + if (!isNodeInvariant(comp, index, true)) { // Index should be either nullptr or a local. return false; @@ -7863,7 +7871,7 @@ static bool GetStoreCoalescingData(Compiler* comp, GenTreeStoreInd* ind, StoreCo data->scale = ind->Addr()->AsAddrMode()->GetScale(); data->offset = ind->Addr()->AsAddrMode()->Offset(); } - else if (ind->Addr()->OperIs(GT_LCL_VAR) && !comp->lvaVarAddrExposed(ind->Addr()->AsLclVar()->GetLclNum())) + else if (isNodeInvariant(comp, ind->Addr(), true)) { // Address is just a local, no offset, scale is 1 data->baseAddr = ind->Addr(); @@ -7919,6 +7927,15 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind) return; } + // TODO-ARM64-CQ: enable TYP_REF if we find a case where it's beneficial. 
+    // The algorithm does support TYP_REF (with a null value), but it does not seem worth
+    // it on ARM64, where it's pretty efficient to do "stp xzr, xzr, [addr]" to clear two
+    // items at once. It may, however, be profitable to do "stp q0, q0, [addr]".
+    if (!varTypeIsIntegral(ind) && !varTypeIsSIMD(ind))
+    {
+        return;
+    }
+
     // We're going to do it in a loop while we see suitable STOREINDs to coalesce.
     // E.g.: we have the following LIR sequence:
     //
@@ -7933,12 +7950,6 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)
     // to get a single store of 8 bytes.
     do
     {
-        // This check is not really needed, just for better throughput.
-        if (!ind->TypeIs(TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT))
-        {
-            return;
-        }
-
         StoreCoalescingData currData;
         StoreCoalescingData prevData;
 
@@ -8002,6 +8013,57 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)
             return;
         }
 
+        // Now the hardest part: decide whether it's safe to use an unaligned write.
+        //
+        // IND<byte> is always fine (and all INDs created here from such)
+        // IND<simd> is not required to be atomic per our Memory Model
+        const bool allowsNonAtomic =
+            ((ind->gtFlags & GTF_IND_ALLOW_NON_ATOMIC) != 0) && ((prevInd->gtFlags & GTF_IND_ALLOW_NON_ATOMIC) != 0);
+
+        if (!allowsNonAtomic && (genTypeSize(ind) > 1) && !varTypeIsSIMD(ind))
+        {
+            // TODO-CQ: if we see that the target is local memory (not address exposed)
+            // we can use any type (including SIMD) for the new store.
+
+            // Ignore indices for now; they can invalidate our alignment assumptions.
+            // We could, however, take the scale into account.
+            if (currData.index != nullptr)
+            {
+                return;
+            }
+
+            // The base address being TYP_REF gives us a hint that the data is pointer-aligned.
+            if (!currData.baseAddr->TypeIs(TYP_REF))
+            {
+                return;
+            }
+
+            // Check whether the combined indir is still aligned.
+            bool isCombinedIndirAtomic = (genTypeSize(ind) < TARGET_POINTER_SIZE) &&
+                                         (min(prevData.offset, currData.offset) % (genTypeSize(ind) * 2)) == 0;
+
+            if (genTypeSize(ind) == TARGET_POINTER_SIZE)
+            {
+#ifdef TARGET_ARM64
+                // Per the Arm Architecture Reference Manual for A-profile architecture:
+                //
+                // * Writes from SIMD and floating-point registers of a 128-bit value that is 64-bit aligned in memory
+                //   are treated as a pair of single-copy atomic 64-bit writes.
+                //
+                // Thus, we can allow 2xLONG -> SIMD, and the same for TYP_REF (when the value is null).
+                //
+                // We also assume that on ARM64 TYP_LONG/TYP_REF are always 64-bit aligned; otherwise,
+                // we're already doing a store that has no atomicity guarantees.
+                isCombinedIndirAtomic = true;
+#endif
+            }
+
+            if (!isCombinedIndirAtomic)
+            {
+                return;
+            }
+        }
+
+        // Since we're merging two stores of the same type, the new type is twice as wide.
var_types oldType = ind->TypeGet(); var_types newType; @@ -8014,32 +8076,80 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind) case TYP_SHORT: case TYP_USHORT: - newType = TYP_INT; // TYP_UINT is not legal in IR + newType = TYP_INT; break; #ifdef TARGET_64BIT case TYP_INT: newType = TYP_LONG; break; + +#if defined(FEATURE_HW_INTRINSICS) + case TYP_LONG: + case TYP_REF: + if (comp->IsBaselineSimdIsaSupported()) + { + // TLDR: we should be here only if one of the conditions is true: + // 1) Both GT_INDs have GTF_IND_ALLOW_NON_ATOMIC flag + // 2) ARM64: Data is at least 8-byte aligned + // 3) AMD64: Data is at least 16-byte aligned on AMD/Intel with AVX+ + // + newType = TYP_SIMD16; + if ((oldType == TYP_REF) && + (!currData.value->IsIntegralConst(0) || !prevData.value->IsIntegralConst(0))) + { + // For TYP_REF we only support null values. In theory, we can also support frozen handles, e.g.: + // + // arr[1] = "hello"; + // arr[0] = "world"; + // + // but we don't want to load managed references into SIMD registers (we can only do so + // when we can issue a nongc region for a block) + return; + } + break; + } + return; + +#if defined(TARGET_AMD64) + case TYP_SIMD16: + if (comp->getPreferredVectorByteLength() >= 32) + { + newType = TYP_SIMD32; + break; + } + return; + + case TYP_SIMD32: + if (comp->getPreferredVectorByteLength() >= 64) + { + newType = TYP_SIMD64; + break; + } + return; +#endif // TARGET_AMD64 +#endif // FEATURE_HW_INTRINSICS #endif // TARGET_64BIT // TYP_FLOAT and TYP_DOUBLE aren't needed here - they're expected to // be converted to TYP_INT/TYP_LONG for constant value. // - // TODO-CQ: - // 2 x LONG/REF -> SIMD16 - // 2 x SIMD16 -> SIMD32 - // 2 x SIMD32 -> SIMD64 - // - // where it's legal (e.g. SIMD is not atomic on x64) + // TYP_UINT and TYP_ULONG are not legal for GT_IND. // default: return; } + // We should not be here for stores requiring write barriers. + assert(!comp->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(ind)); + assert(!comp->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(prevInd)); + // Delete previous STOREIND entirely BlockRange().Remove(std::move(prevIndRange)); + // It's not expected to be contained yet, but just in case... + ind->Data()->ClearContained(); + // We know it's always LEA for now GenTreeAddrMode* addr = ind->Addr()->AsAddrMode(); @@ -8050,8 +8160,29 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind) ind->gtType = newType; ind->Data()->gtType = newType; - // We currently only support these constants for val - assert(prevData.value->IsCnsIntOrI() && currData.value->IsCnsIntOrI()); +#if defined(TARGET_AMD64) && defined(FEATURE_HW_INTRINSICS) + // Upgrading two SIMD stores to a wider SIMD store. 
+ // Only on x64 since ARM64 has no options above SIMD16 + if (varTypeIsSIMD(oldType)) + { + int8_t* lowerCns = prevData.value->AsVecCon()->gtSimdVal.i8; + int8_t* upperCns = currData.value->AsVecCon()->gtSimdVal.i8; + + // if the previous store was at a higher address, swap the constants + if (prevData.offset > currData.offset) + { + std::swap(lowerCns, upperCns); + } + + simd_t newCns = {}; + uint32_t oldWidth = genTypeSize(oldType); + memcpy(newCns.i8, lowerCns, oldWidth); + memcpy(newCns.i8 + oldWidth, upperCns, oldWidth); + + ind->Data()->AsVecCon()->gtSimdVal = newCns; + continue; + } +#endif size_t lowerCns = (size_t)prevData.value->AsIntCon()->IconValue(); size_t upperCns = (size_t)currData.value->AsIntCon()->IconValue(); @@ -8062,6 +8193,24 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind) std::swap(lowerCns, upperCns); } +#if defined(TARGET_64BIT) && defined(FEATURE_HW_INTRINSICS) + // We're promoting two TYP_LONG/TYP_REF into TYP_SIMD16 + // All legality checks were done above. + if (varTypeIsSIMD(newType)) + { + // Replace two 64bit constants with a single 128bit constant + int8_t val[16]; + memcpy(val, &lowerCns, 8); + memcpy(val + 8, &upperCns, 8); + GenTreeVecCon* vecCns = comp->gtNewVconNode(newType, &val); + + BlockRange().InsertAfter(ind->Data(), vecCns); + BlockRange().Remove(ind->Data()); + ind->gtOp2 = vecCns; + continue; + } +#endif // TARGET_64BIT && FEATURE_HW_INTRINSICS + // Trim the constants to the size of the type, e.g. for TYP_SHORT and TYP_USHORT // the mask will be 0xFFFF, for TYP_INT - 0xFFFFFFFF. size_t mask = ~(size_t(0)) >> (sizeof(size_t) - genTypeSize(oldType)) * BITS_IN_BYTE; @@ -8071,10 +8220,12 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind) size_t val = (lowerCns | (upperCns << (genTypeSize(oldType) * BITS_IN_BYTE))); JITDUMP("Coalesced two stores into a single store with value %lld\n", (int64_t)val); - // It's not expected to be contained yet, but just in case... - ind->Data()->ClearContained(); ind->Data()->AsIntCon()->gtIconVal = (ssize_t)val; - ind->gtFlags |= GTF_IND_UNALIGNED; + if (genTypeSize(oldType) == 1) + { + // A mark for future foldings that this IND doesn't need to be atomic. + ind->gtFlags |= GTF_IND_ALLOW_NON_ATOMIC; + } } while (true); #endif // TARGET_XARCH || TARGET_ARM64
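
Note on the isNodeInvariant lambda in GetStoreCoalescingData: coalescing only accepts address operands that are plain GT_LCL_VAR nodes whose address is never exposed, so nothing between the two neighbouring stores can alias and change the base they write through. A rough standalone sketch of that shape check in plain C++ (not JIT code; ToyNode and addrExposedLocals are invented stand-ins for GenTree and lvaVarAddrExposed):

    #include <string>
    #include <unordered_set>

    struct ToyNode
    {
        bool        isLocalVar; // stands in for OperIs(GT_LCL_VAR)
        std::string localName;
    };

    // Accepts only a missing operand (when allowed) or a local whose address
    // has never been exposed, mirroring the invariance requirement above.
    bool IsNodeInvariant(const ToyNode* node, bool allowNull,
                         const std::unordered_set<std::string>& addrExposedLocals)
    {
        if (node == nullptr)
        {
            return allowNull; // e.g. an address with no index operand
        }
        return node->isLocalVar && (addrExposedLocals.count(node->localName) == 0);
    }

    int main()
    {
        std::unordered_set<std::string> exposed = {"x"};
        ToyNode base{true, "arr"};
        ToyNode badBase{true, "x"};
        return (IsNodeInvariant(&base, false, exposed) && !IsNodeInvariant(&badBase, false, exposed) &&
                IsNodeInvariant(nullptr, true, exposed))
                   ? 0
                   : 1;
    }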
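
The isCombinedIndirAtomic computation boils down to a natural-alignment check: with a pointer-aligned base and no index, two adjacent stores of width N can be merged without losing atomicity only if the lower of the two offsets is a multiple of 2*N, and (outside the ARM64 SIMD case) 2*N does not exceed the pointer size. A minimal sketch of that predicate, assuming a 64-bit target (function name and kPointerSize are illustrative, not JIT code):

    #include <algorithm>
    #include <cassert>
    #include <cstddef>

    constexpr size_t kPointerSize = 8; // assumes a 64-bit target

    // Returns true if two adjacent stores of `size` bytes at offsets `off1` and
    // `off2` (from a pointer-aligned base) can be merged into one store of
    // `2 * size` bytes that is still naturally aligned, and therefore atomic.
    bool CombinedStoreStaysAtomic(size_t size, size_t off1, size_t off2)
    {
        if (size * 2 > kPointerSize)
        {
            // Wider merges need SIMD-specific guarantees (see the ARM64 note in the patch).
            return false;
        }
        return (std::min(off1, off2) % (size * 2)) == 0;
    }

    int main()
    {
        assert(CombinedStoreStaysAtomic(2, 8, 10));   // two shorts at +8/+10 -> one aligned 4-byte store
        assert(!CombinedStoreStaysAtomic(2, 10, 12)); // merged 4-byte store at +10 would be misaligned
        assert(CombinedStoreStaysAtomic(4, 16, 20));  // two ints at +16/+20 -> one aligned 8-byte store
    }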
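
The switch over oldType walks a doubling ladder: byte -> short -> int -> long (64-bit targets), then with FEATURE_HW_INTRINSICS long/ref -> SIMD16 and, on x64 only, SIMD16 -> SIMD32 -> SIMD64, gated on the preferred vector length. A compact sketch of that ladder with widths in bytes (illustrative helper, not the JIT's switch):

    #include <cassert>
    #include <cstddef>

    // Returns the width of the merged store in bytes, or 0 if this width cannot
    // be widened further on a target whose widest profitable SIMD store is
    // `maxSimdBytes` (e.g. 16 on ARM64; 16, 32 or 64 on x64 depending on ISA).
    size_t NextMergedWidth(size_t width, size_t maxSimdBytes)
    {
        size_t doubled = width * 2;
        if (doubled <= 8)
        {
            return doubled; // scalar widening: byte -> short -> int -> long
        }
        return (doubled <= maxSimdBytes) ? doubled : 0; // SIMD16 / SIMD32 / SIMD64
    }

    int main()
    {
        assert(NextMergedWidth(1, 16) == 2);   // 2 x byte   -> short
        assert(NextMergedWidth(4, 16) == 8);   // 2 x int    -> long
        assert(NextMergedWidth(8, 16) == 16);  // 2 x long   -> SIMD16
        assert(NextMergedWidth(16, 16) == 0);  // no SIMD32 on this target
        assert(NextMergedWidth(16, 64) == 32); // 2 x SIMD16 -> SIMD32 with AVX
    }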
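
The constant folding at the end relies on little-endian layout: the store at the lower address supplies the low half of the merged constant, each half is first trimmed to its original width, and for the 2 x LONG/REF -> SIMD16 case the two halves are copied into the low and high 8 bytes of a vector constant. A self-contained sketch of both steps (illustrative helpers, not the JIT's AsIntCon/gtNewVconNode path):

    #include <array>
    #include <cassert>
    #include <cstdint>
    #include <cstring>
    #include <utility>

    // Merge two constants of `width` bytes (1, 2 or 4) stored at `prevOffset`
    // and `currOffset` into the constant of one store that is twice as wide.
    uint64_t CoalesceConstants(uint64_t prevCns, uint64_t currCns, size_t prevOffset, size_t currOffset, size_t width)
    {
        assert(width == 1 || width == 2 || width == 4);
        uint64_t lower = prevCns;
        uint64_t upper = currCns;
        if (prevOffset > currOffset)
        {
            std::swap(lower, upper); // the "previous" store was the one at the higher address
        }
        // Trim each constant to the original width, e.g. 0xFFFF for 2-byte stores.
        const uint64_t mask = ~uint64_t(0) >> ((sizeof(uint64_t) - width) * 8);
        return (lower & mask) | ((upper & mask) << (width * 8));
    }

    // Build the byte image of a 16-byte vector constant out of two 8-byte halves,
    // low half first, as in the 2 x LONG/REF -> SIMD16 promotion.
    std::array<uint8_t, 16> MakeSimd16Constant(uint64_t lower, uint64_t upper)
    {
        std::array<uint8_t, 16> bytes{};
        std::memcpy(bytes.data(), &lower, 8);
        std::memcpy(bytes.data() + 8, &upper, 8);
        return bytes;
    }

    int main()
    {
        // Two 2-byte stores of 0x1234 (at +0) and 0x5678 (at +2) become one
        // 4-byte store of 0x56781234; the order the stores were seen in does not matter.
        assert(CoalesceConstants(0x1234, 0x5678, 0, 2, 2) == 0x56781234u);
        assert(CoalesceConstants(0x5678, 0x1234, 2, 0, 2) == 0x56781234u);

        // Two null TYP_REF stores collapse into an all-zero 16-byte vector constant.
        for (uint8_t b : MakeSimd16Constant(0, 0))
        {
            assert(b == 0);
        }
    }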