SVE: Added Load2xVectorAndUnzip, Load3xVectorAndUnzip, Load4xVectorAndUnzip APIs #102180

Merged · 24 commits · May 29, 2024
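
For context, a minimal usage sketch of the three new APIs. This assumes .NET 9+ with the experimental System.Runtime.Intrinsics.Arm.Sve class on SVE-capable hardware; the tuple-returning signatures and CreateTrueMaskInt32 follow the approved API shape, but treat the snippet as illustrative rather than authoritative:

    using System.Numerics;
    using System.Runtime.Intrinsics.Arm;

    internal static unsafe class SveLoadExamples
    {
        public static void Demo(int* src)
        {
            if (!Sve.IsSupported)
                return;

            Vector<int> mask = Sve.CreateTrueMaskInt32();

            // LD2W: reads 2 * Vector<int>.Count ints, deinterleaved into two vectors.
            (Vector<int> a2, Vector<int> b2) = Sve.Load2xVectorAndUnzip(mask, src);

            // LD3W: reads 3 * Vector<int>.Count ints, deinterleaved into three vectors.
            (Vector<int> a3, Vector<int> b3, Vector<int> c3) = Sve.Load3xVectorAndUnzip(mask, src);

            // LD4W: reads 4 * Vector<int>.Count ints, deinterleaved into four vectors.
            (Vector<int> a4, Vector<int> b4, Vector<int> c4, Vector<int> d4) = Sve.Load4xVectorAndUnzip(mask, src);
        }
    }
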
10 changes: 10 additions & 0 deletions src/coreclr/jit/gentree.cpp
@@ -26594,6 +26594,9 @@ bool GenTreeHWIntrinsic::OperIsMemoryLoad(GenTree** pAddr) const
case NI_Sve_LoadVectorUInt16ZeroExtendToUInt64:
case NI_Sve_LoadVectorUInt32ZeroExtendToInt64:
case NI_Sve_LoadVectorUInt32ZeroExtendToUInt64:
case NI_Sve_Load2xVectorAndUnzip:
case NI_Sve_Load3xVectorAndUnzip:
case NI_Sve_Load4xVectorAndUnzip:
addr = Op(2);
break;
#endif // TARGET_ARM64
@@ -27115,6 +27118,13 @@ ClassLayout* GenTreeHWIntrinsic::GetLayout(Compiler* compiler) const
case NI_AdvSimd_Arm64_LoadAndReplicateToVector128x4:
return compiler->typGetBlkLayout(64);

case NI_Sve_Load2xVectorAndUnzip:
return compiler->typGetBlkLayout(compiler->getVectorTByteLength() * 2);
case NI_Sve_Load3xVectorAndUnzip:
return compiler->typGetBlkLayout(compiler->getVectorTByteLength() * 3);
case NI_Sve_Load4xVectorAndUnzip:
return compiler->typGetBlkLayout(compiler->getVectorTByteLength() * 4);

#endif // TARGET_ARM64

default:
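
The GetLayout change reports the full memory footprint of each load: N consecutive vector-length blocks, matching the deinterleaving semantics of the underlying LDN instructions. A scalar model of the 2x case, using a hypothetical helper for illustration only (plain C#, no SVE required):

    // Models Load2xVectorAndUnzip under a full mask: reads 2 * vl contiguous
    // elements starting at 'offset' and splits them into even/odd streams.
    static (int[] evens, int[] odds) Load2xModel(int[] memory, int offset, int vl)
    {
        var evens = new int[vl];
        var odds  = new int[vl];
        for (int i = 0; i < vl; i++)
        {
            evens[i] = memory[offset + 2 * i];     // elements 0, 2, 4, ...
            odds[i]  = memory[offset + 2 * i + 1]; // elements 1, 3, 5, ...
        }
        return (evens, odds);
    }

The 3x and 4x layouts generalize the same way, which is why the block sizes above are simple multiples of getVectorTByteLength().
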
8 changes: 8 additions & 0 deletions src/coreclr/jit/hwintrinsic.cpp
@@ -1603,6 +1603,14 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic,
// Op1 input is a vector. HWIntrinsic requires a mask.
retNode->AsHWIntrinsic()->Op(1) = gtNewSimdConvertVectorToMaskNode(retType, op1, simdBaseJitType, simdSize);
}

if (HWIntrinsicInfo::IsMultiReg(intrinsic))
{
assert(HWIntrinsicInfo::IsExplicitMaskedOperation(retNode->AsHWIntrinsic()->GetHWIntrinsicId()));
assert(HWIntrinsicInfo::IsMultiReg(retNode->AsHWIntrinsic()->GetHWIntrinsicId()));
retNode =
impStoreMultiRegValueToVar(retNode, sig->retTypeSigClass DEBUGARG(CorInfoCallConvExtension::Managed));
}
}

if (retType != nodeRetType)
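
The new block routes multi-reg intrinsic results through impStoreMultiRegValueToVar, so a value defined across several registers is stored to a local before its pieces are consumed. At the source level this corresponds to the returned tuple being materialized, roughly as follows (hedged sketch; usings and class wrapper as in the first snippet):

    static unsafe (Vector<int>, Vector<int>) SplitPair(Vector<int> mask, int* src)
    {
        // The tuple occupies two registers; the importer spills it to a local
        // so each field has a well-defined home before it is read.
        var pair = Sve.Load2xVectorAndUnzip(mask, src);
        return (pair.Item1, pair.Item2);
    }
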
3 changes: 3 additions & 0 deletions src/coreclr/jit/hwintrinsic.h
@@ -831,6 +831,7 @@ struct HWIntrinsicInfo
case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x2:
case NI_AdvSimd_LoadAndReplicateToVector64x2:
case NI_AdvSimd_Arm64_LoadAndReplicateToVector128x2:
case NI_Sve_Load2xVectorAndUnzip:
return 2;

case NI_AdvSimd_LoadVector64x3AndUnzip:
@@ -841,6 +842,7 @@ struct HWIntrinsicInfo
case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x3:
case NI_AdvSimd_LoadAndReplicateToVector64x3:
case NI_AdvSimd_Arm64_LoadAndReplicateToVector128x3:
case NI_Sve_Load3xVectorAndUnzip:
return 3;

case NI_AdvSimd_LoadVector64x4AndUnzip:
@@ -851,6 +853,7 @@ struct HWIntrinsicInfo
case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x4:
case NI_AdvSimd_LoadAndReplicateToVector64x4:
case NI_AdvSimd_Arm64_LoadAndReplicateToVector128x4:
case NI_Sve_Load4xVectorAndUnzip:
return 4;
#endif

29 changes: 29 additions & 0 deletions src/coreclr/jit/hwintrinsicarm64.cpp
@@ -2296,6 +2296,35 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
retNode = impStoreMultiRegValueToVar(op1, sig->retTypeSigClass DEBUGARG(CorInfoCallConvExtension::Managed));
break;
}

case NI_Sve_Load2xVectorAndUnzip:
case NI_Sve_Load3xVectorAndUnzip:
case NI_Sve_Load4xVectorAndUnzip:
{
info.compNeedsConsecutiveRegisters = true;

assert(sig->numArgs == 2);

op2 = impPopStack().val;
op1 = impPopStack().val;

if (op2->OperIs(GT_CAST))
{
// Although the API specifies a pointer, if what we have is a BYREF, that's what
// we really want, so throw away the cast.
if (op2->gtGetOp1()->TypeGet() == TYP_BYREF)
{
op2 = op2->gtGetOp1();
}
}

assert(HWIntrinsicInfo::IsMultiReg(intrinsic));
assert(HWIntrinsicInfo::IsExplicitMaskedOperation(intrinsic));

retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, simdBaseJitType, simdSize);
break;
}

case NI_AdvSimd_LoadAndInsertScalarVector64x2:
case NI_AdvSimd_LoadAndInsertScalarVector64x3:
case NI_AdvSimd_LoadAndInsertScalarVector64x4:
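
The cast-stripping covers call sites where the address originates as a managed reference, in which case the IL carries a byref-to-native-pointer conversion that the importer discards. A hedged sketch of such a call site (Unsafe.AsPointer without pinning is illustrative only and not GC-safe):

    using System.Runtime.CompilerServices;

    static unsafe (Vector<int>, Vector<int>) LoadFromArray(Vector<int> mask, int[] data)
    {
        // Unsafe.AsPointer produces the CAST(byref -> native int) that the
        // importer above peels off, keeping the underlying BYREF.
        return Sve.Load2xVectorAndUnzip(mask, (int*)Unsafe.AsPointer(ref data[0]));
    }
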
20 changes: 20 additions & 0 deletions src/coreclr/jit/hwintrinsiccodegenarm64.cpp
@@ -1356,6 +1356,26 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
}
break;

case NI_Sve_Load2xVectorAndUnzip:
case NI_Sve_Load3xVectorAndUnzip:
case NI_Sve_Load4xVectorAndUnzip:
{
#ifdef DEBUG
// Validates that consecutive registers were used properly.

assert(node->GetMultiRegCount(compiler) == (unsigned int)GetEmitter()->insGetSveReg1ListSize(ins));

regNumber argReg = targetReg;
for (unsigned int i = 0; i < node->GetMultiRegCount(compiler); i++)
{
assert(argReg == node->GetRegNumByIdx(i));
argReg = getNextSIMDRegWithWraparound(argReg);
}
#endif // DEBUG
GetEmitter()->emitIns_R_R_R_I(ins, emitSize, targetReg, op1Reg, op2Reg, 0, opt);
break;
}

case NI_Sve_StoreAndZipx2:
case NI_Sve_StoreAndZipx3:
case NI_Sve_StoreAndZipx4:
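
The DEBUG block asserts the contract that the N destination vectors land in N consecutive SVE registers, which is what the LDN register-list encoding requires. A call like the following should therefore compile down to a single instruction (hedged sketch; usings as before plus System.Runtime.CompilerServices, and the disassembly in the comment is illustrative, not captured output):

    [MethodImpl(MethodImplOptions.NoInlining)]
    static unsafe (Vector<int>, Vector<int>) LoadTwo(Vector<int> mask, int* p)
        // expected shape: ld2w { zN.s, z(N+1).s }, pM/z, [xK]
        // -- two destinations in consecutive Z registers, per the asserts above.
        => Sve.Load2xVectorAndUnzip(mask, p);
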
3 changes: 3 additions & 0 deletions src/coreclr/jit/hwintrinsiclistarm64sve.h
@@ -93,6 +93,9 @@ HARDWARE_INTRINSIC(Sve, LoadVectorUInt16ZeroExtendToUInt32,
HARDWARE_INTRINSIC(Sve, LoadVectorUInt16ZeroExtendToUInt64, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1h, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, LoadVectorUInt32ZeroExtendToInt64, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1w, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, LoadVectorUInt32ZeroExtendToUInt64, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1w, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, Load2xVectorAndUnzip, -1, 2, true, {INS_sve_ld2b, INS_sve_ld2b, INS_sve_ld2h, INS_sve_ld2h, INS_sve_ld2w, INS_sve_ld2w, INS_sve_ld2d, INS_sve_ld2d, INS_sve_ld2w, INS_sve_ld2d}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_MultiReg|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_NeedsConsecutiveRegisters|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Sve, Load3xVectorAndUnzip, -1, 2, true, {INS_sve_ld3b, INS_sve_ld3b, INS_sve_ld3h, INS_sve_ld3h, INS_sve_ld3w, INS_sve_ld3w, INS_sve_ld3d, INS_sve_ld3d, INS_sve_ld3w, INS_sve_ld3d}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_MultiReg|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_NeedsConsecutiveRegisters|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Sve, Load4xVectorAndUnzip, -1, 2, true, {INS_sve_ld4b, INS_sve_ld4b, INS_sve_ld4h, INS_sve_ld4h, INS_sve_ld4w, INS_sve_ld4w, INS_sve_ld4d, INS_sve_ld4d, INS_sve_ld4w, INS_sve_ld4d}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_MultiReg|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_NeedsConsecutiveRegisters|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Sve, Max, -1, -1, false, {INS_sve_smax, INS_sve_umax, INS_sve_smax, INS_sve_umax, INS_sve_smax, INS_sve_umax, INS_sve_smax, INS_sve_umax, INS_sve_fmax, INS_sve_fmax}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, MaxAcross, -1, -1, false, {INS_sve_smaxv, INS_sve_umaxv, INS_sve_smaxv, INS_sve_umaxv, INS_sve_smaxv, INS_sve_umaxv, INS_sve_smaxv, INS_sve_umaxv, INS_sve_fmaxv, INS_sve_fmaxv}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, MaxNumber, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fmaxnm, INS_sve_fmaxnm}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
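
The per-type instruction columns select the element-sized form of the load. A hedged illustration of the mapping, assuming the standard per-element-type Sve overloads (comments name the column that applies):

    static unsafe void ElementSizedForms(
        Vector<byte> maskB, byte* pb,
        Vector<ushort> maskH, ushort* ph,
        Vector<float> maskS, float* ps,
        Vector<double> maskD, double* pd)
    {
        var b = Sve.Load2xVectorAndUnzip(maskB, pb); // byte column   -> ld2b
        var h = Sve.Load2xVectorAndUnzip(maskH, ph); // ushort column -> ld2h
        var s = Sve.Load2xVectorAndUnzip(maskS, ps); // float column  -> ld2w
        var d = Sve.Load2xVectorAndUnzip(maskD, pd); // double column -> ld2d
    }
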
13 changes: 13 additions & 0 deletions src/coreclr/jit/lsraarm64.cpp
@@ -1780,6 +1780,19 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
break;
}

case NI_Sve_Load2xVectorAndUnzip:
case NI_Sve_Load3xVectorAndUnzip:
case NI_Sve_Load4xVectorAndUnzip:
{
assert(intrin.op1 != nullptr);
assert(intrin.op2 != nullptr);
assert(intrinsicTree->OperIsMemoryLoadOrStore());
srcCount += BuildAddrUses(intrin.op2);
BuildConsecutiveRegistersForDef(intrinsicTree, dstCount);
*pDstCount = dstCount;
break;
}

case NI_Sve_StoreAndZipx2:
case NI_Sve_StoreAndZipx3:
case NI_Sve_StoreAndZipx4: