8 changes: 7 additions & 1 deletion src/coreclr/jit/abi.cpp
@@ -125,7 +125,13 @@ var_types ABIPassingSegment::GetRegisterType() const
 #ifdef FEATURE_SIMD
             case 16:
                 return TYP_SIMD16;
-#endif
+#ifdef TARGET_XARCH
+            case 32:
+                return TYP_SIMD32;
+            case 64:
+                return TYP_SIMD64;
+#endif // TARGET_XARCH
+#endif // FEATURE_SIMD
             default:
                 assert(!"Unexpected size for floating point register");
                 return TYP_UNDEF;
15 changes: 12 additions & 3 deletions src/coreclr/jit/codegenxarch.cpp
@@ -6063,7 +6063,7 @@ void CodeGen::genCall(GenTreeCall* call)
         }
         else
 #endif // TARGET_X86
-            if (varTypeIsFloating(returnType))
+            if (varTypeUsesFloatReg(returnType))
         {
             returnReg = REG_FLOATRET;
         }
@@ -6158,7 +6158,7 @@ void CodeGen::genCallInstruction(GenTreeCall* call X86_ARG(target_ssize_t stackA
     }
     else
     {
-        assert(!varTypeIsStruct(call));
+        assert(!varTypeIsStruct(call) || varTypeIsSIMD(call));
 
         if (call->TypeIs(TYP_REF))
         {
@@ -6168,6 +6168,10 @@ void CodeGen::genCallInstruction(GenTreeCall* call X86_ARG(target_ssize_t stackA
         {
             params.retSize = EA_BYREF;
         }
+        else if (varTypeIsSIMD(call))
+        {
+            params.retSize = emitTypeSize(call->TypeGet());
+        }
     }
 }
 
@@ -11488,7 +11492,12 @@ void CodeGen::genClearAvxStateInEpilog()
         // VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX
         // register) and before any call to an unknown function.
 
-        instGen(INS_vzeroupper);
+        // Skip vzeroupper when the method returns a 256-bit or wider SIMD value in a register,
+        // as vzeroupper would destroy the upper bits of the return value.
+        if (genTypeSize(m_compiler->info.compRetNativeType) <= 16)
+        {
+            instGen(INS_vzeroupper);
+        }
     }
 }
 
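To make the hazard concrete, here is a minimal C# sketch (the method and class names are illustrative; Avx.Add is the real System.Runtime.Intrinsics.X86 API). With this change, a method like this returns its result in ymm0 on SysV x64, and an unconditional vzeroupper in the epilog would zero bits 128-255 of ymm0 and corrupt the returned vector:

    using System.Runtime.Intrinsics;
    using System.Runtime.Intrinsics.X86;

    static class VzeroupperExample
    {
        // Returns a full 256-bit value in a single register after this change;
        // the epilog therefore must not emit vzeroupper on this path.
        static Vector256<float> Sum(Vector256<float> a, Vector256<float> b)
            => Avx.Add(a, b);
    }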
30 changes: 30 additions & 0 deletions src/coreclr/jit/compiler.cpp
@@ -793,6 +793,36 @@ var_types Compiler::getReturnTypeForStruct(CORINFO_CLASS_HANDLE clsHnd,
 #endif
 
 #ifdef UNIX_AMD64_ABI
+    // Opaque SIMD types (Vector64/128/256/512) should be returned in a single
+    // vector register (XMM/YMM/ZMM), matching the argument passing convention.
+    // We must handle this before the SysV struct classifier because the classifier
+    // produces [SSE, SSE] for 16-byte types (no SSEUP support), which splits the
+    // return into xmm0:xmm1 and adds unnecessary shuffling overhead.
+    // Use isHWSIMDClass (namespace check) instead of isOpaqueSIMDType (handle cache)
+    // because the SIMD handle cache may not be initialized this early in compilation.
+    if (isHWSIMDClass(clsHnd))
+    {
+        var_types simdType = getSIMDTypeForSize(structSize);
+        if (simdType != TYP_UNDEF && simdType != TYP_SIMD12)
+        {
+            bool canReturnAsSingleSimd = (structSize <= 16) || (structSize == 32 && canUseVexEncoding()) ||
+                                         (structSize == 64 && canUseEvexEncoding());
+
+            if (canReturnAsSingleSimd)
+            {
+                howToReturnStruct = SPK_PrimitiveType;
+                useType           = simdType;
+
+                if (wbReturnStruct != nullptr)
+                {
+                    *wbReturnStruct = howToReturnStruct;
+                }
+
+                return useType;
+            }
+        }
+    }
+
     // An 8-byte struct may need to be returned in a floating point register
     // So we always consult the struct "Classifier" routine
     //
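A minimal C# sketch of the case this fixes (the method is illustrative; Vector128<T> is the real System.Runtime.Intrinsics type). Before this change, the classifier's [SSE, SSE] answer meant a 16-byte vector came back split across xmm0:xmm1 and had to be reassembled at the call site; now it is returned whole:

    using System.Runtime.Intrinsics;

    static class ReturnExample
    {
        // Previously returned as two SSE eightbytes (xmm0:xmm1) with shuffle
        // code at the call site; now returned as a single TYP_SIMD16 in xmm0.
        static Vector128<int> PassThrough(Vector128<int> v) => v;
    }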
8 changes: 8 additions & 0 deletions src/coreclr/jit/compiler.h
@@ -9526,6 +9526,14 @@ class Compiler
     int getSIMDTypeAlignment(var_types simdType);
 
 public:
+    // Returns true if the given SIMD type should be passed in a single vector register
+    // for ABI purposes. True for opaque SIMD types like Vector64/128/256/512 but false
+    // for decomposable types like Plane, Quaternion, Vector2, Vector3, Vector4.
+    bool isSingleRegisterSIMDType(var_types type, ClassLayout* layout) const
+    {
+        return varTypeIsSIMD(type) && (type != TYP_SIMD12) && (layout != nullptr) && isOpaqueSIMDType(layout);
+    }
+
     // Get the number of bytes in a System.Numeric.Vector<T> for the current compilation.
     // Note - cannot be used for System.Runtime.Intrinsic
     uint32_t getVectorTByteLength()
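To illustrate which managed types the new predicate accepts (the types are the real System.Numerics / System.Runtime.Intrinsics types; the methods are illustrative):

    using System.Numerics;
    using System.Runtime.Intrinsics;

    static class OpaquenessExample
    {
        // Opaque intrinsic vector: isSingleRegisterSIMDType returns true, so the
        // argument and the return value each occupy a single vector register.
        static Vector128<float> Opaque(Vector128<float> v) => v;

        // Quaternion exposes four named float fields; it is not opaque, so it
        // continues through the regular SysV struct classification.
        static Quaternion NotOpaque(Quaternion q) => q;
    }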
4 changes: 2 additions & 2 deletions src/coreclr/jit/lsrabuild.cpp
@@ -4297,13 +4297,13 @@ int LinearScan::BuildReturn(GenTree* tree)
     SingleTypeRegSet useCandidates = RBM_NONE;
 
 #if FEATURE_MULTIREG_RET
-#ifdef TARGET_ARM64
+#if defined(TARGET_ARM64) || defined(TARGET_AMD64)
     if (varTypeIsSIMD(tree) && !op1->IsMultiRegLclVar())
     {
         BuildUse(op1, RBM_DOUBLERET.GetFloatRegSet());
         return 1;
     }
-#endif // TARGET_ARM64
+#endif // TARGET_ARM64 || TARGET_AMD64
 
     if (varTypeIsStruct(tree))
     {
79 changes: 53 additions & 26 deletions src/coreclr/jit/targetamd64.cpp
@@ -63,47 +63,74 @@ ABIPassingInformation SysVX64Classifier::Classify(Compiler* comp,
                                                   ClassLayout*  structLayout,
                                                   WellKnownArg  wellKnownParam)
 {
-    bool canEnreg = false;
+    bool canEnreg           = false;
+    bool handleAsSingleSimd = false;
     SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
-    if (varTypeIsStruct(type))
-    {
-        comp->eeGetSystemVAmd64PassStructInRegisterDescriptor(structLayout->GetClassHandle(), &structDesc);
 
-        if (structDesc.passedInRegisters)
-        {
-            unsigned intRegCount = 0;
-            unsigned floatRegCount = 0;
-
-            for (unsigned int i = 0; i < structDesc.eightByteCount; i++)
-            {
-                if (structDesc.IsIntegralSlot(i))
-                {
-                    intRegCount++;
-                }
-                else if (structDesc.IsSseSlot(i))
-                {
-                    floatRegCount++;
-                }
-                else
-                {
-                    assert(!"Invalid eightbyte classification type.");
-                    break;
-                }
-            }
-
-            canEnreg = (intRegCount <= m_intRegs.Count()) && (floatRegCount <= m_floatRegs.Count());
-        }
-    }
-    else
+    // SIMD vector types (Vector64, Vector128, Vector256, Vector512) correspond to
+    // native __m64/__m128/__m256/__m512 and should be passed in a single vector
+    // register (XMM/YMM/ZMM) when hardware supports it.
+    // Exclude SIMD12 (Vector3) and non-opaque SIMD types (Plane, Quaternion, etc.)
+    // which have decomposable fields and should use normal struct classification.
+    if (comp->isSingleRegisterSIMDType(type, structLayout))
     {
-        unsigned availRegs = varTypeUsesFloatArgReg(type) ? m_floatRegs.Count() : m_intRegs.Count();
-        canEnreg = availRegs > 0;
+        unsigned simdSize = genTypeSize(type);
+        if ((simdSize <= 16) || (simdSize == 32 && comp->canUseVexEncoding()) ||
+            (simdSize == 64 && comp->canUseEvexEncoding()))
+        {
+            handleAsSingleSimd = true;
+            canEnreg           = (m_floatRegs.Count() > 0);
+        }
+    }
+
+    if (!handleAsSingleSimd)
+    {
+        if (varTypeIsStruct(type))
+        {
+            comp->eeGetSystemVAmd64PassStructInRegisterDescriptor(structLayout->GetClassHandle(), &structDesc);
+
+            if (structDesc.passedInRegisters)
+            {
+                unsigned intRegCount   = 0;
+                unsigned floatRegCount = 0;
+
+                for (unsigned int i = 0; i < structDesc.eightByteCount; i++)
+                {
+                    if (structDesc.IsIntegralSlot(i))
+                    {
+                        intRegCount++;
+                    }
+                    else if (structDesc.IsSseSlot(i))
+                    {
+                        floatRegCount++;
+                    }
+                    else
+                    {
+                        assert(!"Invalid eightbyte classification type.");
+                        break;
+                    }
+                }
+
+                canEnreg = (intRegCount <= m_intRegs.Count()) && (floatRegCount <= m_floatRegs.Count());
+            }
+        }
+        else
+        {
+            unsigned availRegs = varTypeUsesFloatArgReg(type) ? m_floatRegs.Count() : m_intRegs.Count();
+            canEnreg           = availRegs > 0;
+        }
     }
 
     ABIPassingInformation info;
    if (canEnreg)
     {
-        if (varTypeIsStruct(type))
+        if (handleAsSingleSimd)
+        {
+            regNumber reg = m_floatRegs.Dequeue();
+            info          = ABIPassingInformation::FromSegmentByValue(comp,
+                                                                      ABIPassingSegment::InRegister(reg, 0, genTypeSize(type)));
+        }
+        else if (varTypeIsStruct(type))
         {
             info = ABIPassingInformation(comp, structDesc.eightByteCount);
72 changes: 72 additions & 0 deletions src/coreclr/pal/inc/unixasmmacrosamd64.inc
@@ -259,6 +259,78 @@ C_FUNC(\Name\()_End):
 
 .endm
 
+// Save upper 128 bits of YMM0-YMM7 and upper 256 bits of ZMM0-ZMM7.
+// Uses g_avxSupported / g_avx512Supported runtime flags.
+// ofs: base offset into extra locals area for the 384 bytes of save space.
+// Clobbers r11.
+.macro SAVE_UPPER_VECTOR_REGISTERS ofs
+
+        PREPARE_EXTERNAL_VAR g_avxSupported, r11
+        cmp     byte ptr [r11], 0
+        je      LOCAL_LABEL(SaveUpperVecDone_\@)
+
+        vextractf128 xmmword ptr [rsp + \ofs + 0x00], ymm0, 1
+        vextractf128 xmmword ptr [rsp + \ofs + 0x10], ymm1, 1
+        vextractf128 xmmword ptr [rsp + \ofs + 0x20], ymm2, 1
+        vextractf128 xmmword ptr [rsp + \ofs + 0x30], ymm3, 1
+        vextractf128 xmmword ptr [rsp + \ofs + 0x40], ymm4, 1
+        vextractf128 xmmword ptr [rsp + \ofs + 0x50], ymm5, 1
+        vextractf128 xmmword ptr [rsp + \ofs + 0x60], ymm6, 1
+        vextractf128 xmmword ptr [rsp + \ofs + 0x70], ymm7, 1
+
+        PREPARE_EXTERNAL_VAR g_avx512Supported, r11
+        cmp     byte ptr [r11], 0
+        je      LOCAL_LABEL(SaveUpperVecDone_\@)
+
+        vextractf64x4 ymmword ptr [rsp + \ofs + 0x80], zmm0, 1
+        vextractf64x4 ymmword ptr [rsp + \ofs + 0xA0], zmm1, 1
+        vextractf64x4 ymmword ptr [rsp + \ofs + 0xC0], zmm2, 1
+        vextractf64x4 ymmword ptr [rsp + \ofs + 0xE0], zmm3, 1
+        vextractf64x4 ymmword ptr [rsp + \ofs + 0x100], zmm4, 1
+        vextractf64x4 ymmword ptr [rsp + \ofs + 0x120], zmm5, 1
+        vextractf64x4 ymmword ptr [rsp + \ofs + 0x140], zmm6, 1
+        vextractf64x4 ymmword ptr [rsp + \ofs + 0x160], zmm7, 1
+
+LOCAL_LABEL(SaveUpperVecDone_\@):
+
+.endm
+
+// Restore upper 128 bits of YMM0-YMM7 and upper 256 bits of ZMM0-ZMM7.
+// Must be called AFTER RESTORE_FLOAT_ARGUMENT_REGISTERS (which restores XMM lower 128 bits).
+// ofs: same base offset used in SAVE_UPPER_VECTOR_REGISTERS.
+// Clobbers r11.
+.macro RESTORE_UPPER_VECTOR_REGISTERS ofs
+
+        PREPARE_EXTERNAL_VAR g_avxSupported, r11
+        cmp     byte ptr [r11], 0
+        je      LOCAL_LABEL(RestoreUpperVecDone_\@)
+
+        vinsertf128 ymm0, ymm0, xmmword ptr [rsp + \ofs + 0x00], 1
+        vinsertf128 ymm1, ymm1, xmmword ptr [rsp + \ofs + 0x10], 1
+        vinsertf128 ymm2, ymm2, xmmword ptr [rsp + \ofs + 0x20], 1
+        vinsertf128 ymm3, ymm3, xmmword ptr [rsp + \ofs + 0x30], 1
+        vinsertf128 ymm4, ymm4, xmmword ptr [rsp + \ofs + 0x40], 1
+        vinsertf128 ymm5, ymm5, xmmword ptr [rsp + \ofs + 0x50], 1
+        vinsertf128 ymm6, ymm6, xmmword ptr [rsp + \ofs + 0x60], 1
+        vinsertf128 ymm7, ymm7, xmmword ptr [rsp + \ofs + 0x70], 1
+
+        PREPARE_EXTERNAL_VAR g_avx512Supported, r11
+        cmp     byte ptr [r11], 0
+        je      LOCAL_LABEL(RestoreUpperVecDone_\@)
+
+        vinsertf64x4 zmm0, zmm0, ymmword ptr [rsp + \ofs + 0x80], 1
+        vinsertf64x4 zmm1, zmm1, ymmword ptr [rsp + \ofs + 0xA0], 1
+        vinsertf64x4 zmm2, zmm2, ymmword ptr [rsp + \ofs + 0xC0], 1
+        vinsertf64x4 zmm3, zmm3, ymmword ptr [rsp + \ofs + 0xE0], 1
+        vinsertf64x4 zmm4, zmm4, ymmword ptr [rsp + \ofs + 0x100], 1
+        vinsertf64x4 zmm5, zmm5, ymmword ptr [rsp + \ofs + 0x120], 1
+        vinsertf64x4 zmm6, zmm6, ymmword ptr [rsp + \ofs + 0x140], 1
+        vinsertf64x4 zmm7, zmm7, ymmword ptr [rsp + \ofs + 0x160], 1
+
+LOCAL_LABEL(RestoreUpperVecDone_\@):
+
+.endm
+
 // Stack layout:
 //
 // (stack parameters)
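A usage sketch for the new macro pair (illustrative only: the offsets and SomeHelper are hypothetical; SAVE/RESTORE_FLOAT_ARGUMENT_REGISTERS are the existing macros these are designed to bracket, and the restore ordering follows the macro comment above):

    SAVE_FLOAT_ARGUMENT_REGISTERS    0x20   // lower 128 bits of xmm0-xmm7
    SAVE_UPPER_VECTOR_REGISTERS      0xA0   // upper ymm/zmm bits, 384 bytes
    call    C_FUNC(SomeHelper)              // may clobber vector state
    RESTORE_FLOAT_ARGUMENT_REGISTERS 0x20   // lower halves first, as required
    RESTORE_UPPER_VECTOR_REGISTERS   0xA0   // then re-insert the upper bits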
@@ -214,18 +214,48 @@ private static bool ClassifyEightBytes(TypeDesc typeDesc,
                 return true;
             }
 
-            // The SIMD and Int128 Intrinsic types are meant to be handled specially and should not be passed as struct registers
+            // SIMD Intrinsic types should be classified as SSE to pass in XMM registers.
+            // We must NOT fall through to normal field enumeration because the internal fields
+            // (e.g., Vector64<T>._00 is ulong) would classify as INTEGER instead of SSE.
             if (typeDesc.IsIntrinsic)
             {
                 InstantiatedType instantiatedType = typeDesc as InstantiatedType;
                 if (instantiatedType != null)
                 {
-                    if (VectorFieldLayoutAlgorithm.IsVectorType(instantiatedType) ||
-                        VectorOfTFieldLayoutAlgorithm.IsVectorOfTType(instantiatedType) ||
-                        Int128FieldLayoutAlgorithm.IsIntegerType(instantiatedType))
+                    if (Int128FieldLayoutAlgorithm.IsIntegerType(instantiatedType))
                     {
                         return false;
                     }
+
+                    if (VectorFieldLayoutAlgorithm.IsVectorType(instantiatedType) ||
+                        VectorOfTFieldLayoutAlgorithm.IsVectorOfTType(instantiatedType))
+                    {
+                        int structSize = typeDesc.GetElementSize().AsInt;
+
+                        // The entry point (GetSystemVAmd64PassStructInRegisterDescriptor) already
+                        // filters out types > CLR_SYSTEMV_MAX_STRUCT_BYTES_TO_PASS_IN_REGISTERS,
+                        // so only Vector64 (8B) and Vector128 (16B) can reach here. Vector256/512
+                        // are handled by the JIT's handleAsSingleSimd path and never consult this
+                        // classification.
+                        Debug.Assert(structSize <= CLR_SYSTEMV_MAX_STRUCT_BYTES_TO_PASS_IN_REGISTERS);
+
+                        // Directly classify each 8-byte chunk as SSE.
+                        for (int offset = 0; offset < structSize; offset += SYSTEMV_EIGHT_BYTE_SIZE_IN_BYTES)
+                        {
+                            int eightByteSize = Math.Min(SYSTEMV_EIGHT_BYTE_SIZE_IN_BYTES, structSize - offset);
+                            int normalizedOffset = offset + startOffsetOfStruct;
+
+                            helper.FieldClassifications[helper.CurrentUniqueOffsetField] = SystemVClassificationTypeSSE;
+                            helper.FieldSizes[helper.CurrentUniqueOffsetField] = eightByteSize;
+                            helper.FieldOffsets[helper.CurrentUniqueOffsetField] = normalizedOffset;
+                            helper.CurrentUniqueOffsetField++;
+                        }
+
+                        helper.LargestFieldOffset = startOffsetOfStruct + structSize - Math.Min(structSize, SYSTEMV_EIGHT_BYTE_SIZE_IN_BYTES);
+
+                        AssignClassifiedEightByteTypes(ref helper);
+                        return true;
+                    }
                 }
             }
 
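A minimal C# sketch of the hazard the comment describes (the method is illustrative; Vector64<T> is the real System.Runtime.Intrinsics type):

    using System.Runtime.Intrinsics;

    static class ClassificationExample
    {
        // Vector64<T>'s storage is a single ulong field, so enumerating fields
        // would classify its eightbyte as INTEGER (an integer register); the
        // direct SSE classification above keeps it in xmm0, matching the JIT.
        static Vector64<int> Echo(Vector64<int> v) => v;
    }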