Merged stores: Fix alignment-related issues and enable SIMD where possible (#92939)

Co-authored-by: SingleAccretion <62474226+SingleAccretion@users.noreply.github.com>
EgorBo and SingleAccretion authored Oct 5, 2023
1 parent 28c6415 commit ce655e3
Showing 2 changed files with 175 additions and 23 deletions.
1 change: 1 addition & 0 deletions src/coreclr/jit/gentree.h
@@ -490,6 +490,7 @@ enum GenTreeFlags : unsigned int
GTF_IND_INVARIANT = 0x01000000, // GT_IND -- the target is invariant (a prejit indirection)
GTF_IND_NONNULL = 0x00400000, // GT_IND -- the indirection never returns null (zero)
GTF_IND_INITCLASS = 0x00200000, // OperIsIndir() -- the indirection requires preceding static cctor
GTF_IND_ALLOW_NON_ATOMIC = 0x00100000, // GT_IND -- this memory access does not need to be atomic

GTF_IND_COPYABLE_FLAGS = GTF_IND_VOLATILE | GTF_IND_NONFAULTING | GTF_IND_UNALIGNED | GTF_IND_INITCLASS,
GTF_IND_FLAGS = GTF_IND_COPYABLE_FLAGS | GTF_IND_NONNULL | GTF_IND_TGT_NOT_HEAP | GTF_IND_TGT_HEAP | GTF_IND_INVARIANT,
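A standalone sketch, not part of the diff, of the property the new flag relies on: GTF_IND_ALLOW_NON_ATOMIC has to be a single bit that does not collide with the neighbouring GT_IND flags shown in this hunk. Only the constants visible above are copied; the rest of the enum is omitted.

// Minimal, self-contained check -- not the real gentree.h.
#include <cstdint>

constexpr uint32_t GTF_IND_INVARIANT        = 0x01000000;
constexpr uint32_t GTF_IND_NONNULL          = 0x00400000;
constexpr uint32_t GTF_IND_INITCLASS        = 0x00200000;
constexpr uint32_t GTF_IND_ALLOW_NON_ATOMIC = 0x00100000; // the new flag

// The new flag is a single bit...
static_assert((GTF_IND_ALLOW_NON_ATOMIC & (GTF_IND_ALLOW_NON_ATOMIC - 1)) == 0,
              "flag must be a power of two");
// ...and it does not alias any of the flags listed next to it in the hunk.
static_assert((GTF_IND_ALLOW_NON_ATOMIC &
               (GTF_IND_INVARIANT | GTF_IND_NONNULL | GTF_IND_INITCLASS)) == 0,
              "flag must not overlap another GT_IND flag");

int main() { return 0; }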
197 changes: 174 additions & 23 deletions src/coreclr/jit/lower.cpp
@@ -7833,26 +7833,34 @@ static bool GetStoreCoalescingData(Compiler* comp, GenTreeStoreInd* ind, StoreCo
}

// Data has to be INT_CNS or VEC_CNS.
if (!ind->Data()->IsCnsIntOrI())
if (!ind->Data()->IsCnsIntOrI() && !ind->Data()->IsVectorConst())
{
return false;
}

auto isNodeInvariant = [](Compiler* comp, GenTree* node, bool allowNull) {
if (node == nullptr)
{
return allowNull;
}
// We can allow bigger trees here, but it's not clear if it's worth it.
return node->OperIs(GT_LCL_VAR) && !comp->lvaVarAddrExposed(node->AsLclVar()->GetLclNum());
};

data->targetType = ind->TypeGet();
data->value = ind->Data();
if (ind->Addr()->OperIs(GT_LEA))
{
GenTree* base = ind->Addr()->AsAddrMode()->Base();
GenTree* index = ind->Addr()->AsAddrMode()->Index();
if ((base == nullptr) || !base->OperIs(GT_LCL_VAR) || comp->lvaVarAddrExposed(base->AsLclVar()->GetLclNum()))
if (!isNodeInvariant(comp, base, false))
{
// Base must be a local. It's possible for it to be nullptr when index is not null,
// but let's ignore such cases.
return false;
}

if ((index != nullptr) &&
(!index->OperIs(GT_LCL_VAR) || comp->lvaVarAddrExposed(index->AsLclVar()->GetLclNum())))
if (!isNodeInvariant(comp, index, true))
{
// Index should be either nullptr or a local.
return false;
@@ -7863,7 +7871,7 @@ static bool GetStoreCoalescingData(Compiler* comp, GenTreeStoreInd* ind, StoreCo
data->scale = ind->Addr()->AsAddrMode()->GetScale();
data->offset = ind->Addr()->AsAddrMode()->Offset();
}
else if (ind->Addr()->OperIs(GT_LCL_VAR) && !comp->lvaVarAddrExposed(ind->Addr()->AsLclVar()->GetLclNum()))
else if (isNodeInvariant(comp, ind->Addr(), true))
{
// Address is just a local, no offset, scale is 1
data->baseAddr = ind->Addr();
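For orientation, a simplified sketch of the data this helper extracts, using plain fields instead of GenTree nodes. The field names mirror StoreCoalescingData as used in this hunk (targetType, value, baseAddr, index, scale, offset); the adjacency-of-offsets check that actually triggers coalescing lives further down in LowerStoreIndirCoalescing and is only assumed here.

// Sketch only: two stores have a coalescable shape when everything except the
// offset and the value matches (locals stand in for GenTree nodes).
#include <cstdint>
#include <cstdio>

struct StoreCoalescingDataSketch
{
    int      targetType;  // width/kind of the store (stands in for var_types)
    int      baseLclNum;  // base address: a non-address-exposed local
    int      indexLclNum; // -1 when the address has no index
    unsigned scale;
    int      offset;
    int64_t  value;
};

bool sameAddressShape(const StoreCoalescingDataSketch& a, const StoreCoalescingDataSketch& b)
{
    return (a.targetType == b.targetType) && (a.baseLclNum == b.baseLclNum) &&
           (a.indexLclNum == b.indexLclNum) && (a.scale == b.scale);
}

int main()
{
    // e.g. byte stores to [V02 + 0] and [V02 + 1]
    StoreCoalescingDataSketch s0{1, 2, -1, 1, 0, 0x11};
    StoreCoalescingDataSketch s1{1, 2, -1, 1, 1, 0x22};
    printf("same shape: %d\n", sameAddressShape(s0, s1));
    return 0;
}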
@@ -7919,6 +7927,15 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)
return;
}

// TODO-ARM64-CQ: enable TYP_REF if we find a case where it's beneficial.
// The algorithm does support TYP_REF (with null value), but it seems to be not worth
// it on ARM64 where it's pretty efficient to do "stp xzr, xzr, [addr]" to clear two
// items at once. Although, it may be profitable to do "stp q0, q0, [addr]".
if (!varTypeIsIntegral(ind) && !varTypeIsSIMD(ind))
{
return;
}

// We're going to do it in a loop while we see suitable STOREINDs to coalesce.
// E.g.: we have the following LIR sequence:
//
Expand All @@ -7933,12 +7950,6 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)
// to get a single store of 8 bytes.
do
{
// This check is not really needed, just for better throughput.
if (!ind->TypeIs(TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT))
{
return;
}

StoreCoalescingData currData;
StoreCoalescingData prevData;
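The TODO-ARM64-CQ note above is about patterns like the one below: clearing two adjacent pointer-sized slots is already cheap on ARM64 ("stp xzr, xzr, [addr]"), which is why TYP_REF is left out of coalescing here for now. An illustration only, not part of the commit:

// Two adjacent null stores that an ARM64 compiler can already emit as a single
// "stp xzr, xzr" store-pair without any JIT store coalescing.
#include <cstdio>

struct TwoSlots
{
    void* first;
    void* second;
};

void clearBoth(TwoSlots* p)
{
    p->first  = nullptr; // adjacent, pointer-sized, zero values...
    p->second = nullptr; // ...a natural "store pair of zero register" candidate
}

int main()
{
    TwoSlots s{&s, &s};
    clearBoth(&s);
    printf("%p %p\n", s.first, s.second);
    return 0;
}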

Expand Down Expand Up @@ -8002,6 +8013,57 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)
return;
}

// Now the hardest part: decide whether it's safe to use an unaligned write.
//
// IND<byte> is always fine (and all IND<X> created here from such)
// IND<simd> is not required to be atomic per our Memory Model
const bool allowsNonAtomic =
((ind->gtFlags & GTF_IND_ALLOW_NON_ATOMIC) != 0) && ((prevInd->gtFlags & GTF_IND_ALLOW_NON_ATOMIC) != 0);

if (!allowsNonAtomic && (genTypeSize(ind) > 1) && !varTypeIsSIMD(ind))
{
// TODO-CQ: if we see that the target is a local memory (non address exposed)
// we can use any type (including SIMD) for a new load.

// Ignore indices for now, they can invalidate our alignment assumptions.
// Although, we can take scale into account.
if (currData.index != nullptr)
{
return;
}

// Base address being TYP_REF gives us a hint that data is pointer-aligned.
if (!currData.baseAddr->TypeIs(TYP_REF))
{
return;
}

// Check whether the combined indir is still aligned.
bool isCombinedIndirAtomic = (genTypeSize(ind) < TARGET_POINTER_SIZE) &&
(min(prevData.offset, currData.offset) % (genTypeSize(ind) * 2)) == 0;

if (genTypeSize(ind) == TARGET_POINTER_SIZE)
{
#ifdef TARGET_ARM64
// Per Arm Architecture Reference Manual for A-profile architecture:
//
// * Writes from SIMD and floating-point registers of a 128-bit value that is 64-bit aligned in memory
//   are treated as a pair of single-copy atomic 64-bit writes.
//
// Thus, we can allow 2xLONG -> SIMD, same for TYP_REF (for value being null)
//
// And we assume on ARM64 TYP_LONG/TYP_REF are always 64-bit aligned, otherwise
// we're already doing a load that has no atomicity guarantees.
isCombinedIndirAtomic = true;
#endif
}

if (!isCombinedIndirAtomic)
{
return;
}
}

// Since we're merging two stores of the same type, the new type is twice wider.
var_types oldType = ind->TypeGet();
var_types newType;
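A standalone sketch of the alignment predicate computed above for the scalar, smaller-than-pointer case: the merged store is treated as atomic only if it is still narrower than a pointer and starts at an offset aligned to its doubled size. The base address is assumed to be at least pointer-aligned, which is what the TYP_REF check on the base stands in for; TARGET_POINTER_SIZE is assumed to be 8.

// Sketch only, mirroring: (genTypeSize(ind) < TARGET_POINTER_SIZE) &&
//                         (min(prevOffset, currOffset) % (genTypeSize(ind) * 2)) == 0
#include <algorithm>
#include <cstdio>

constexpr unsigned kTargetPointerSize = 8; // 64-bit target assumed

bool combinedIndirStaysAtomic(unsigned accessSize, unsigned prevOffset, unsigned currOffset)
{
    unsigned combinedSize  = accessSize * 2;                   // the coalesced store is twice as wide
    unsigned combinedStart = std::min(prevOffset, currOffset); // and starts at the lower offset
    return (accessSize < kTargetPointerSize) && ((combinedStart % combinedSize) == 0);
}

int main()
{
    printf("%d\n", combinedIndirStaysAtomic(2, 8, 10));  // 1: a 4-byte store at offset 8
    printf("%d\n", combinedIndirStaysAtomic(2, 10, 12)); // 0: offset 10 is not 4-byte aligned
    return 0;
}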
@@ -8014,32 +8076,80 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)

case TYP_SHORT:
case TYP_USHORT:
newType = TYP_INT; // TYP_UINT is not legal in IR
newType = TYP_INT;
break;

#ifdef TARGET_64BIT
case TYP_INT:
newType = TYP_LONG;
break;

#if defined(FEATURE_HW_INTRINSICS)
case TYP_LONG:
case TYP_REF:
if (comp->IsBaselineSimdIsaSupported())
{
// TLDR: we should be here only if one of the conditions is true:
// 1) Both GT_INDs have GTF_IND_ALLOW_NON_ATOMIC flag
// 2) ARM64: Data is at least 8-byte aligned
// 3) AMD64: Data is at least 16-byte aligned on AMD/Intel with AVX+
//
newType = TYP_SIMD16;
if ((oldType == TYP_REF) &&
(!currData.value->IsIntegralConst(0) || !prevData.value->IsIntegralConst(0)))
{
// For TYP_REF we only support null values. In theory, we can also support frozen handles, e.g.:
//
// arr[1] = "hello";
// arr[0] = "world";
//
// but we don't want to load managed references into SIMD registers (we can only do so
// when we can issue a nongc region for a block)
return;
}
break;
}
return;

#if defined(TARGET_AMD64)
case TYP_SIMD16:
if (comp->getPreferredVectorByteLength() >= 32)
{
newType = TYP_SIMD32;
break;
}
return;

case TYP_SIMD32:
if (comp->getPreferredVectorByteLength() >= 64)
{
newType = TYP_SIMD64;
break;
}
return;
#endif // TARGET_AMD64
#endif // FEATURE_HW_INTRINSICS
#endif // TARGET_64BIT

// TYP_FLOAT and TYP_DOUBLE aren't needed here - they're expected to
// be converted to TYP_INT/TYP_LONG for constant value.
//
// TODO-CQ:
// 2 x LONG/REF -> SIMD16
// 2 x SIMD16 -> SIMD32
// 2 x SIMD32 -> SIMD64
//
// where it's legal (e.g. SIMD is not atomic on x64)
// TYP_UINT and TYP_ULONG are not legal for GT_IND.
//
default:
return;
}

// We should not be here for stores requiring write barriers.
assert(!comp->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(ind));
assert(!comp->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(prevInd));

// Delete previous STOREIND entirely
BlockRange().Remove(std::move(prevIndRange));

// It's not expected to be contained yet, but just in case...
ind->Data()->ClearContained();

// We know it's always LEA for now
GenTreeAddrMode* addr = ind->Addr()->AsAddrMode();

@@ -8050,8 +8160,29 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)
ind->gtType = newType;
ind->Data()->gtType = newType;

// We currently only support these constants for val
assert(prevData.value->IsCnsIntOrI() && currData.value->IsCnsIntOrI());
#if defined(TARGET_AMD64) && defined(FEATURE_HW_INTRINSICS)
// Upgrading two SIMD stores to a wider SIMD store.
// Only on x64 since ARM64 has no options above SIMD16
if (varTypeIsSIMD(oldType))
{
int8_t* lowerCns = prevData.value->AsVecCon()->gtSimdVal.i8;
int8_t* upperCns = currData.value->AsVecCon()->gtSimdVal.i8;

// if the previous store was at a higher address, swap the constants
if (prevData.offset > currData.offset)
{
std::swap(lowerCns, upperCns);
}

simd_t newCns = {};
uint32_t oldWidth = genTypeSize(oldType);
memcpy(newCns.i8, lowerCns, oldWidth);
memcpy(newCns.i8 + oldWidth, upperCns, oldWidth);

ind->Data()->AsVecCon()->gtSimdVal = newCns;
continue;
}
#endif

size_t lowerCns = (size_t)prevData.value->AsIntCon()->IconValue();
size_t upperCns = (size_t)currData.value->AsIntCon()->IconValue();
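For the SIMD path above, the constant merge is a plain concatenation: the constant from the lower address becomes the low half of the doubled vector and the other becomes the high half. A standalone sketch with byte buffers instead of GenTreeVecCon, assuming the swap-by-offset has already been done:

// Sketch only: concatenate two oldWidth-byte constants into one 2*oldWidth-byte constant.
#include <cstdint>
#include <cstring>
#include <cstdio>

void mergeVectorConstants(const int8_t* lowerCns, const int8_t* upperCns,
                          unsigned oldWidth, int8_t* newCns /* 2 * oldWidth bytes */)
{
    memcpy(newCns, lowerCns, oldWidth);            // low half: lower address
    memcpy(newCns + oldWidth, upperCns, oldWidth); // high half: higher address
}

int main()
{
    int8_t lo[16], hi[16], merged[32];
    memset(lo, 0x11, sizeof(lo));
    memset(hi, 0x22, sizeof(hi));
    mergeVectorConstants(lo, hi, 16, merged); // two SIMD16 constants -> one SIMD32 constant
    printf("%02x %02x\n", (unsigned)(uint8_t)merged[0], (unsigned)(uint8_t)merged[16]); // 11 22
    return 0;
}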
@@ -8062,6 +8193,24 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)
std::swap(lowerCns, upperCns);
}

#if defined(TARGET_64BIT) && defined(FEATURE_HW_INTRINSICS)
// We're promoting two TYP_LONG/TYP_REF into TYP_SIMD16
// All legality checks were done above.
if (varTypeIsSIMD(newType))
{
// Replace two 64bit constants with a single 128bit constant
int8_t val[16];
memcpy(val, &lowerCns, 8);
memcpy(val + 8, &upperCns, 8);
GenTreeVecCon* vecCns = comp->gtNewVconNode(newType, &val);

BlockRange().InsertAfter(ind->Data(), vecCns);
BlockRange().Remove(ind->Data());
ind->gtOp2 = vecCns;
continue;
}
#endif // TARGET_64BIT && FEATURE_HW_INTRINSICS

// Trim the constants to the size of the type, e.g. for TYP_SHORT and TYP_USHORT
// the mask will be 0xFFFF, for TYP_INT - 0xFFFFFFFF.
size_t mask = ~(size_t(0)) >> (sizeof(size_t) - genTypeSize(oldType)) * BITS_IN_BYTE;
@@ -8071,10 +8220,12 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)
size_t val = (lowerCns | (upperCns << (genTypeSize(oldType) * BITS_IN_BYTE)));
JITDUMP("Coalesced two stores into a single store with value %lld\n", (int64_t)val);

// It's not expected to be contained yet, but just in case...
ind->Data()->ClearContained();
ind->Data()->AsIntCon()->gtIconVal = (ssize_t)val;
ind->gtFlags |= GTF_IND_UNALIGNED;
if (genTypeSize(oldType) == 1)
{
// A mark for future foldings that this IND doesn't need to be atomic.
ind->gtFlags |= GTF_IND_ALLOW_NON_ATOMIC;
}

} while (true);
#endif // TARGET_XARCH || TARGET_ARM64
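Finally, a standalone sketch of the scalar constant merge at the end of the loop: both constants are trimmed to the old width with a mask, then the constant belonging to the higher address is shifted into the upper half (little-endian layout, which matches the x64/ARM64 targets this code runs for). uint64_t stands in for size_t on a 64-bit host.

// Sketch only: mirrors the mask/shift/OR done on lowerCns and upperCns above.
#include <cstdint>
#include <cstdio>

constexpr unsigned BITS_IN_BYTE = 8;

uint64_t mergeScalarConstants(uint64_t lowerCns, uint64_t upperCns, unsigned oldSize)
{
    // For oldSize == 2 the mask is 0xFFFF, for oldSize == 4 it is 0xFFFFFFFF, etc.
    uint64_t mask = ~uint64_t(0) >> ((sizeof(uint64_t) - oldSize) * BITS_IN_BYTE);
    lowerCns &= mask;
    upperCns &= mask;
    // The constant stored at the lower address ends up in the low bits.
    return lowerCns | (upperCns << (oldSize * BITS_IN_BYTE));
}

int main()
{
    // Two adjacent 2-byte stores of 0x1122 and 0x3344 merge into one 4-byte store.
    printf("0x%llx\n", (unsigned long long)mergeScalarConstants(0x1122, 0x3344, 2)); // 0x33441122
    return 0;
}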
