diff --git a/src/coreclr/jit/abi.cpp b/src/coreclr/jit/abi.cpp index 4c6e7aa4c28f32..01fb2a16af7cc3 100644 --- a/src/coreclr/jit/abi.cpp +++ b/src/coreclr/jit/abi.cpp @@ -125,7 +125,13 @@ var_types ABIPassingSegment::GetRegisterType() const #ifdef FEATURE_SIMD case 16: return TYP_SIMD16; -#endif +#ifdef TARGET_XARCH + case 32: + return TYP_SIMD32; + case 64: + return TYP_SIMD64; +#endif // TARGET_XARCH +#endif // FEATURE_SIMD default: assert(!"Unexpected size for floating point register"); return TYP_UNDEF; diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 2a6b01d3fe0b01..b723707105ae3b 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -6063,7 +6063,7 @@ void CodeGen::genCall(GenTreeCall* call) } else #endif // TARGET_X86 - if (varTypeIsFloating(returnType)) + if (varTypeUsesFloatReg(returnType)) { returnReg = REG_FLOATRET; } @@ -6158,7 +6158,7 @@ void CodeGen::genCallInstruction(GenTreeCall* call X86_ARG(target_ssize_t stackA } else { - assert(!varTypeIsStruct(call)); + assert(!varTypeIsStruct(call) || varTypeIsSIMD(call)); if (call->TypeIs(TYP_REF)) { @@ -6168,6 +6168,10 @@ void CodeGen::genCallInstruction(GenTreeCall* call X86_ARG(target_ssize_t stackA { params.retSize = EA_BYREF; } + else if (varTypeIsSIMD(call)) + { + params.retSize = emitTypeSize(call->TypeGet()); + } } } @@ -11488,7 +11492,12 @@ void CodeGen::genClearAvxStateInEpilog() // VZEROUPPER before returning from any function that uses VEX (that does not produce a VEX // register) and before any call to an unknown function. - instGen(INS_vzeroupper); + // Skip vzeroupper when the method returns a 256-bit or wider SIMD value in a register, + // as vzeroupper would destroy the upper bits of the return value. + if (genTypeSize(m_compiler->info.compRetNativeType) <= 16) + { + instGen(INS_vzeroupper); + } } } diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 379c33ab5a5042..52c584b4b34861 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -793,6 +793,36 @@ var_types Compiler::getReturnTypeForStruct(CORINFO_CLASS_HANDLE clsHnd, #endif #ifdef UNIX_AMD64_ABI + // Opaque SIMD types (Vector64/128/256/512) should be returned in a single + // vector register (XMM/YMM/ZMM), matching the argument passing convention. + // We must handle this before the SysV struct classifier because the classifier + // produces [SSE, SSE] for 16-byte types (no SSEUP support), which splits the + // return into xmm0:xmm1 and adds unnecessary shuffling overhead. + // Use isHWSIMDClass (namespace check) instead of isOpaqueSIMDType (handle cache) + // because the SIMD handle cache may not be initialized this early in compilation. 
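+    // For example, a method returning Vector128<float> now reports SPK_PrimitiveType / TYP_SIMD16
+    // and comes back whole in xmm0 (instead of the xmm0:xmm1 split described above), and a
+    // Vector256<int> return comes back in ymm0 when VEX encoding is available.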
+ if (isHWSIMDClass(clsHnd)) + { + var_types simdType = getSIMDTypeForSize(structSize); + if (simdType != TYP_UNDEF && simdType != TYP_SIMD12) + { + bool canReturnAsSingleSimd = (structSize <= 16) || (structSize == 32 && canUseVexEncoding()) || + (structSize == 64 && canUseEvexEncoding()); + + if (canReturnAsSingleSimd) + { + howToReturnStruct = SPK_PrimitiveType; + useType = simdType; + + if (wbReturnStruct != nullptr) + { + *wbReturnStruct = howToReturnStruct; + } + + return useType; + } + } + } + // An 8-byte struct may need to be returned in a floating point register // So we always consult the struct "Classifier" routine // diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index ff4bd66b0d9e92..04dcec9f0bfef8 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -9526,6 +9526,14 @@ class Compiler int getSIMDTypeAlignment(var_types simdType); public: + // Returns true if the given SIMD type should be passed in a single vector register + // for ABI purposes. True for opaque SIMD types like Vector64/128/256/512 but false + // for decomposable types like Plane, Quaternion, Vector2, Vector3, Vector4. + bool isSingleRegisterSIMDType(var_types type, ClassLayout* layout) const + { + return varTypeIsSIMD(type) && (type != TYP_SIMD12) && (layout != nullptr) && isOpaqueSIMDType(layout); + } + // Get the number of bytes in a System.Numeric.Vector for the current compilation. // Note - cannot be used for System.Runtime.Intrinsic uint32_t getVectorTByteLength() diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index ba199aafdeedea..d5a25e55f66b73 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -4297,13 +4297,13 @@ int LinearScan::BuildReturn(GenTree* tree) SingleTypeRegSet useCandidates = RBM_NONE; #if FEATURE_MULTIREG_RET -#ifdef TARGET_ARM64 +#if defined(TARGET_ARM64) || defined(TARGET_AMD64) if (varTypeIsSIMD(tree) && !op1->IsMultiRegLclVar()) { BuildUse(op1, RBM_DOUBLERET.GetFloatRegSet()); return 1; } -#endif // TARGET_ARM64 +#endif // TARGET_ARM64 || TARGET_AMD64 if (varTypeIsStruct(tree)) { diff --git a/src/coreclr/jit/targetamd64.cpp b/src/coreclr/jit/targetamd64.cpp index a9b6b5c0e5e2a5..fa28a8868b5f59 100644 --- a/src/coreclr/jit/targetamd64.cpp +++ b/src/coreclr/jit/targetamd64.cpp @@ -63,47 +63,74 @@ ABIPassingInformation SysVX64Classifier::Classify(Compiler* comp, ClassLayout* structLayout, WellKnownArg wellKnownParam) { - bool canEnreg = false; + bool canEnreg = false; + bool handleAsSingleSimd = false; SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc; - if (varTypeIsStruct(type)) + + // SIMD vector types (Vector64, Vector128, Vector256, Vector512) correspond to + // native __m64/__m128/__m256/__m512 and should be passed in a single vector + // register (XMM/YMM/ZMM) when hardware supports it. + // Exclude SIMD12 (Vector3) and non-opaque SIMD types (Plane, Quaternion, etc.) + // which have decomposable fields and should use normal struct classification. 
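+    // When a vector register is still available this maps 8/16-byte vectors to one XMM,
+    // 32-byte vectors to one YMM (requires VEX) and 64-byte vectors to one ZMM (requires EVEX);
+    // otherwise the type falls through to the regular SysV struct classification below.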
+ if (comp->isSingleRegisterSIMDType(type, structLayout)) { - comp->eeGetSystemVAmd64PassStructInRegisterDescriptor(structLayout->GetClassHandle(), &structDesc); + unsigned simdSize = genTypeSize(type); + if ((simdSize <= 16) || (simdSize == 32 && comp->canUseVexEncoding()) || + (simdSize == 64 && comp->canUseEvexEncoding())) + { + handleAsSingleSimd = true; + canEnreg = (m_floatRegs.Count() > 0); + } + } - if (structDesc.passedInRegisters) + if (!handleAsSingleSimd) + { + if (varTypeIsStruct(type)) { - unsigned intRegCount = 0; - unsigned floatRegCount = 0; + comp->eeGetSystemVAmd64PassStructInRegisterDescriptor(structLayout->GetClassHandle(), &structDesc); - for (unsigned int i = 0; i < structDesc.eightByteCount; i++) + if (structDesc.passedInRegisters) { - if (structDesc.IsIntegralSlot(i)) - { - intRegCount++; - } - else if (structDesc.IsSseSlot(i)) - { - floatRegCount++; - } - else + unsigned intRegCount = 0; + unsigned floatRegCount = 0; + + for (unsigned int i = 0; i < structDesc.eightByteCount; i++) { - assert(!"Invalid eightbyte classification type."); - break; + if (structDesc.IsIntegralSlot(i)) + { + intRegCount++; + } + else if (structDesc.IsSseSlot(i)) + { + floatRegCount++; + } + else + { + assert(!"Invalid eightbyte classification type."); + break; + } } - } - canEnreg = (intRegCount <= m_intRegs.Count()) && (floatRegCount <= m_floatRegs.Count()); + canEnreg = (intRegCount <= m_intRegs.Count()) && (floatRegCount <= m_floatRegs.Count()); + } + } + else + { + unsigned availRegs = varTypeUsesFloatArgReg(type) ? m_floatRegs.Count() : m_intRegs.Count(); + canEnreg = availRegs > 0; } - } - else - { - unsigned availRegs = varTypeUsesFloatArgReg(type) ? m_floatRegs.Count() : m_intRegs.Count(); - canEnreg = availRegs > 0; } ABIPassingInformation info; if (canEnreg) { - if (varTypeIsStruct(type)) + if (handleAsSingleSimd) + { + regNumber reg = m_floatRegs.Dequeue(); + info = ABIPassingInformation::FromSegmentByValue(comp, + ABIPassingSegment::InRegister(reg, 0, genTypeSize(type))); + } + else if (varTypeIsStruct(type)) { info = ABIPassingInformation(comp, structDesc.eightByteCount); diff --git a/src/coreclr/pal/inc/unixasmmacrosamd64.inc b/src/coreclr/pal/inc/unixasmmacrosamd64.inc index 90c8947e754297..d02a145e78c696 100644 --- a/src/coreclr/pal/inc/unixasmmacrosamd64.inc +++ b/src/coreclr/pal/inc/unixasmmacrosamd64.inc @@ -259,6 +259,78 @@ C_FUNC(\Name\()_End): .endm +// Save upper 128 bits of YMM0-YMM7 and upper 256 bits of ZMM0-ZMM7. +// Uses g_avxSupported / g_avx512Supported runtime flags. +// ofs: base offset into extra locals area for the 384 bytes of save space. +// Clobbers r11. 
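+// Save area layout relative to ofs (384 bytes total):
+//   ofs + 0x000..0x07F: upper 128 bits of ymm0-ymm7 (8 x 16 bytes)
+//   ofs + 0x080..0x17F: upper 256 bits of zmm0-zmm7 (8 x 32 bytes)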
+.macro SAVE_UPPER_VECTOR_REGISTERS ofs + + PREPARE_EXTERNAL_VAR g_avxSupported, r11 + cmp byte ptr [r11], 0 + je LOCAL_LABEL(SaveUpperVecDone_\@) + + vextractf128 xmmword ptr [rsp + \ofs + 0x00], ymm0, 1 + vextractf128 xmmword ptr [rsp + \ofs + 0x10], ymm1, 1 + vextractf128 xmmword ptr [rsp + \ofs + 0x20], ymm2, 1 + vextractf128 xmmword ptr [rsp + \ofs + 0x30], ymm3, 1 + vextractf128 xmmword ptr [rsp + \ofs + 0x40], ymm4, 1 + vextractf128 xmmword ptr [rsp + \ofs + 0x50], ymm5, 1 + vextractf128 xmmword ptr [rsp + \ofs + 0x60], ymm6, 1 + vextractf128 xmmword ptr [rsp + \ofs + 0x70], ymm7, 1 + + PREPARE_EXTERNAL_VAR g_avx512Supported, r11 + cmp byte ptr [r11], 0 + je LOCAL_LABEL(SaveUpperVecDone_\@) + + vextractf64x4 ymmword ptr [rsp + \ofs + 0x80], zmm0, 1 + vextractf64x4 ymmword ptr [rsp + \ofs + 0xA0], zmm1, 1 + vextractf64x4 ymmword ptr [rsp + \ofs + 0xC0], zmm2, 1 + vextractf64x4 ymmword ptr [rsp + \ofs + 0xE0], zmm3, 1 + vextractf64x4 ymmword ptr [rsp + \ofs + 0x100], zmm4, 1 + vextractf64x4 ymmword ptr [rsp + \ofs + 0x120], zmm5, 1 + vextractf64x4 ymmword ptr [rsp + \ofs + 0x140], zmm6, 1 + vextractf64x4 ymmword ptr [rsp + \ofs + 0x160], zmm7, 1 + +LOCAL_LABEL(SaveUpperVecDone_\@): + +.endm + +// Restore upper 128 bits of YMM0-YMM7 and upper 256 bits of ZMM0-ZMM7. +// Must be called AFTER RESTORE_FLOAT_ARGUMENT_REGISTERS (which restores XMM lower 128 bits). +// ofs: same base offset used in SAVE_UPPER_VECTOR_REGISTERS. +// Clobbers r11. +.macro RESTORE_UPPER_VECTOR_REGISTERS ofs + + PREPARE_EXTERNAL_VAR g_avxSupported, r11 + cmp byte ptr [r11], 0 + je LOCAL_LABEL(RestoreUpperVecDone_\@) + + vinsertf128 ymm0, ymm0, xmmword ptr [rsp + \ofs + 0x00], 1 + vinsertf128 ymm1, ymm1, xmmword ptr [rsp + \ofs + 0x10], 1 + vinsertf128 ymm2, ymm2, xmmword ptr [rsp + \ofs + 0x20], 1 + vinsertf128 ymm3, ymm3, xmmword ptr [rsp + \ofs + 0x30], 1 + vinsertf128 ymm4, ymm4, xmmword ptr [rsp + \ofs + 0x40], 1 + vinsertf128 ymm5, ymm5, xmmword ptr [rsp + \ofs + 0x50], 1 + vinsertf128 ymm6, ymm6, xmmword ptr [rsp + \ofs + 0x60], 1 + vinsertf128 ymm7, ymm7, xmmword ptr [rsp + \ofs + 0x70], 1 + + PREPARE_EXTERNAL_VAR g_avx512Supported, r11 + cmp byte ptr [r11], 0 + je LOCAL_LABEL(RestoreUpperVecDone_\@) + + vinsertf64x4 zmm0, zmm0, ymmword ptr [rsp + \ofs + 0x80], 1 + vinsertf64x4 zmm1, zmm1, ymmword ptr [rsp + \ofs + 0xA0], 1 + vinsertf64x4 zmm2, zmm2, ymmword ptr [rsp + \ofs + 0xC0], 1 + vinsertf64x4 zmm3, zmm3, ymmword ptr [rsp + \ofs + 0xE0], 1 + vinsertf64x4 zmm4, zmm4, ymmword ptr [rsp + \ofs + 0x100], 1 + vinsertf64x4 zmm5, zmm5, ymmword ptr [rsp + \ofs + 0x120], 1 + vinsertf64x4 zmm6, zmm6, ymmword ptr [rsp + \ofs + 0x140], 1 + vinsertf64x4 zmm7, zmm7, ymmword ptr [rsp + \ofs + 0x160], 1 + +LOCAL_LABEL(RestoreUpperVecDone_\@): + +.endm + // Stack layout: // // (stack parameters) diff --git a/src/coreclr/tools/Common/JitInterface/SystemVStructClassificator.cs b/src/coreclr/tools/Common/JitInterface/SystemVStructClassificator.cs index 943f23ce102a4a..b0c32f62a37f82 100644 --- a/src/coreclr/tools/Common/JitInterface/SystemVStructClassificator.cs +++ b/src/coreclr/tools/Common/JitInterface/SystemVStructClassificator.cs @@ -214,18 +214,48 @@ private static bool ClassifyEightBytes(TypeDesc typeDesc, return true; } - // The SIMD and Int128 Intrinsic types are meant to be handled specially and should not be passed as struct registers + // SIMD Intrinsic types should be classified as SSE to pass in XMM registers. 
+ // We must NOT fall through to normal field enumeration because the internal fields + // (e.g., Vector64._00 is ulong) would classify as INTEGER instead of SSE. if (typeDesc.IsIntrinsic) { InstantiatedType instantiatedType = typeDesc as InstantiatedType; if (instantiatedType != null) { - if (VectorFieldLayoutAlgorithm.IsVectorType(instantiatedType) || - VectorOfTFieldLayoutAlgorithm.IsVectorOfTType(instantiatedType) || - Int128FieldLayoutAlgorithm.IsIntegerType(instantiatedType)) + if (Int128FieldLayoutAlgorithm.IsIntegerType(instantiatedType)) { return false; } + + if (VectorFieldLayoutAlgorithm.IsVectorType(instantiatedType) || + VectorOfTFieldLayoutAlgorithm.IsVectorOfTType(instantiatedType)) + { + int structSize = typeDesc.GetElementSize().AsInt; + + // The entry point (GetSystemVAmd64PassStructInRegisterDescriptor) already + // filters out types > CLR_SYSTEMV_MAX_STRUCT_BYTES_TO_PASS_IN_REGISTERS, + // so only Vector64 (8B) and Vector128 (16B) can reach here. Vector256/512 + // are handled by the JIT's handleAsSingleSimd path and never consult this + // classification. + Debug.Assert(structSize <= CLR_SYSTEMV_MAX_STRUCT_BYTES_TO_PASS_IN_REGISTERS); + + // Directly classify each 8-byte chunk as SSE. + for (int offset = 0; offset < structSize; offset += SYSTEMV_EIGHT_BYTE_SIZE_IN_BYTES) + { + int eightByteSize = Math.Min(SYSTEMV_EIGHT_BYTE_SIZE_IN_BYTES, structSize - offset); + int normalizedOffset = offset + startOffsetOfStruct; + + helper.FieldClassifications[helper.CurrentUniqueOffsetField] = SystemVClassificationTypeSSE; + helper.FieldSizes[helper.CurrentUniqueOffsetField] = eightByteSize; + helper.FieldOffsets[helper.CurrentUniqueOffsetField] = normalizedOffset; + helper.CurrentUniqueOffsetField++; + } + + helper.LargestFieldOffset = startOffsetOfStruct + structSize - Math.Min(structSize, SYSTEMV_EIGHT_BYTE_SIZE_IN_BYTES); + + AssignClassifiedEightByteTypes(ref helper); + return true; + } } } diff --git a/src/coreclr/vm/amd64/CachedInterfaceDispatchCoreCLR.S b/src/coreclr/vm/amd64/CachedInterfaceDispatchCoreCLR.S index 9b2345591947d6..28d7889c2344dd 100644 --- a/src/coreclr/vm/amd64/CachedInterfaceDispatchCoreCLR.S +++ b/src/coreclr/vm/amd64/CachedInterfaceDispatchCoreCLR.S @@ -49,15 +49,26 @@ LEAF_END RhpVTableOffsetDispatch, _TEXT // [rsp+0] m_ReturnAddress: contains the return address of caller to stub NESTED_ENTRY RhpInterfaceDispatchSlow, _TEXT, NoHandler - PROLOG_WITH_TRANSITION_BLOCK + PROLOG_WITH_TRANSITION_BLOCK 384 + + // r11 holds indirection cell from the caller. Save before clobbering. + mov r10, r11 + + SAVE_UPPER_VECTOR_REGISTERS 0x00 lea rdi, [rsp + __PWTB_TransitionBlock] // pTransitionBlock - mov rsi, r11 // indirection cell + mov rsi, r10 // indirection cell call C_FUNC(CID_ResolveWorker) + mov r10, rax - EPILOG_WITH_TRANSITION_BLOCK_TAILCALL - TAILJMP_RAX + RESTORE_FLOAT_ARGUMENT_REGISTERS __PWTB_FloatArgumentRegisters + RESTORE_UPPER_VECTOR_REGISTERS 0x00 + + free_stack __PWTB_StackAlloc + POP_ARGUMENT_REGISTERS + POP_CALLEE_SAVED_REGISTERS + jmp r10 NESTED_END RhpInterfaceDispatchSlow, _TEXT @@ -65,15 +76,26 @@ NESTED_END RhpInterfaceDispatchSlow, _TEXT // r11 contains the address of the indirection cell (which is the MethodPtrAux field of the delegate) NESTED_ENTRY CID_VirtualOpenDelegateDispatch, _TEXT, NoHandler - PROLOG_WITH_TRANSITION_BLOCK + PROLOG_WITH_TRANSITION_BLOCK 384 + + // r11 holds indirection cell from the caller. Save before clobbering. 
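+        // (r10 is free to hold it across the save below: SAVE_UPPER_VECTOR_REGISTERS only
+        // clobbers r11.)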
+ mov r10, r11 + + SAVE_UPPER_VECTOR_REGISTERS 0x00 lea rdi, [rsp + __PWTB_TransitionBlock] // pTransitionBlock - mov rsi, r11 // indirection cell + mov rsi, r10 // indirection cell call C_FUNC(CID_VirtualOpenDelegateDispatchWorker) + mov r10, rax - EPILOG_WITH_TRANSITION_BLOCK_TAILCALL - TAILJMP_RAX + RESTORE_FLOAT_ARGUMENT_REGISTERS __PWTB_FloatArgumentRegisters + RESTORE_UPPER_VECTOR_REGISTERS 0x00 + + free_stack __PWTB_StackAlloc + POP_ARGUMENT_REGISTERS + POP_CALLEE_SAVED_REGISTERS + jmp r10 NESTED_END CID_VirtualOpenDelegateDispatch, _TEXT diff --git a/src/coreclr/vm/amd64/externalmethodfixupthunk.S b/src/coreclr/vm/amd64/externalmethodfixupthunk.S index f6b89f1b98f065..9be7cdb64e4d6d 100644 --- a/src/coreclr/vm/amd64/externalmethodfixupthunk.S +++ b/src/coreclr/vm/amd64/externalmethodfixupthunk.S @@ -9,16 +9,23 @@ NESTED_ENTRY DelayLoad_MethodCall, _TEXT, NoHandler - PROLOG_WITH_TRANSITION_BLOCK 0, 0x10, rdx, rcx, 0 + PROLOG_WITH_TRANSITION_BLOCK 384, 0x10, rdx, rcx, 0 + + SAVE_UPPER_VECTOR_REGISTERS 0x00 lea rdi, [rsp + __PWTB_TransitionBlock] // pTransitionBlock mov rsi, rax // pIndirection call C_FUNC(ExternalMethodFixupWorker) + mov r10, rax - EPILOG_WITH_TRANSITION_BLOCK_TAILCALL + RESTORE_FLOAT_ARGUMENT_REGISTERS __PWTB_FloatArgumentRegisters + RESTORE_UPPER_VECTOR_REGISTERS 0x00 - TAILJMP_RAX + free_stack __PWTB_StackAlloc + POP_ARGUMENT_REGISTERS + POP_CALLEE_SAVED_REGISTERS + jmp r10 NESTED_END DelayLoad_MethodCall, _TEXT diff --git a/src/coreclr/vm/amd64/theprestubamd64.S b/src/coreclr/vm/amd64/theprestubamd64.S index 8d601c0ab9f280..e2ac738a496d0b 100644 --- a/src/coreclr/vm/amd64/theprestubamd64.S +++ b/src/coreclr/vm/amd64/theprestubamd64.S @@ -6,9 +6,12 @@ #include "asmconstants.h" NESTED_ENTRY ThePreStub, _TEXT, NoHandler - PROLOG_WITH_TRANSITION_BLOCK 8, 0, 0, 0, 0 + // Extra locals: 8 (Swift return buffer) + 384 (upper vector halves) + PROLOG_WITH_TRANSITION_BLOCK 392, 0, 0, 0, 0 mov [rsp], rax // Return buffer in Swift calling convention + SAVE_UPPER_VECTOR_REGISTERS 0x08 + // // call PreStubWorker // @@ -18,7 +21,13 @@ NESTED_ENTRY ThePreStub, _TEXT, NoHandler mov r10, rax mov rax, [rsp] - EPILOG_WITH_TRANSITION_BLOCK_TAILCALL + + RESTORE_FLOAT_ARGUMENT_REGISTERS __PWTB_FloatArgumentRegisters + RESTORE_UPPER_VECTOR_REGISTERS 0x08 + + free_stack __PWTB_StackAlloc + POP_ARGUMENT_REGISTERS + POP_CALLEE_SAVED_REGISTERS jmp r10 NESTED_END ThePreStub, _TEXT diff --git a/src/coreclr/vm/amd64/unixasmhelpers.S b/src/coreclr/vm/amd64/unixasmhelpers.S index 53115def14a26b..43321e9baacdc2 100644 --- a/src/coreclr/vm/amd64/unixasmhelpers.S +++ b/src/coreclr/vm/amd64/unixasmhelpers.S @@ -136,14 +136,22 @@ NESTED_END OnHijackTripThread, _TEXT #ifdef FEATURE_TIERED_COMPILATION NESTED_ENTRY OnCallCountThresholdReachedStub, _TEXT, NoHandler - PROLOG_WITH_TRANSITION_BLOCK + PROLOG_WITH_TRANSITION_BLOCK 384 + + SAVE_UPPER_VECTOR_REGISTERS 0x00 lea rdi, [rsp + __PWTB_TransitionBlock] // TransitionBlock * mov rsi, rax // stub-identifying token, see OnCallCountThresholdReachedStub call C_FUNC(OnCallCountThresholdReached) + mov r10, rax - EPILOG_WITH_TRANSITION_BLOCK_TAILCALL - TAILJMP_RAX + RESTORE_FLOAT_ARGUMENT_REGISTERS __PWTB_FloatArgumentRegisters + RESTORE_UPPER_VECTOR_REGISTERS 0x00 + + free_stack __PWTB_StackAlloc + POP_ARGUMENT_REGISTERS + POP_CALLEE_SAVED_REGISTERS + jmp r10 NESTED_END OnCallCountThresholdReachedStub, _TEXT NESTED_ENTRY JIT_Patchpoint, _TEXT, NoHandler diff --git a/src/coreclr/vm/amd64/virtualcallstubamd64.S b/src/coreclr/vm/amd64/virtualcallstubamd64.S index 
822eaaf2718f6e..581ba97cce3ce7 100644 --- a/src/coreclr/vm/amd64/virtualcallstubamd64.S +++ b/src/coreclr/vm/amd64/virtualcallstubamd64.S @@ -26,20 +26,32 @@ NESTED_ENTRY ResolveWorkerAsmStub, _TEXT, NoHandler - PROLOG_WITH_TRANSITION_BLOCK 0, 8, rdx, 0, 0 + PROLOG_WITH_TRANSITION_BLOCK 384, 8, rdx, 0, 0 + + // r11 holds indirection cell + flags from the caller. Save it before + // SAVE_UPPER_VECTOR_REGISTERS clobbers r11 as a scratch register. + mov r10, r11 + + SAVE_UPPER_VECTOR_REGISTERS 0x00 // token stored in rdx by prolog lea rdi, [rsp + __PWTB_TransitionBlock] // pTransitionBlock - mov rsi, r11 // indirection cell + flags + mov rsi, r10 // indirection cell + flags mov rcx, rsi and rcx, 7 // flags sub rsi, rcx // indirection cell call C_FUNC(VSD_ResolveWorker) + mov r10, rax + + RESTORE_FLOAT_ARGUMENT_REGISTERS __PWTB_FloatArgumentRegisters + RESTORE_UPPER_VECTOR_REGISTERS 0x00 - EPILOG_WITH_TRANSITION_BLOCK_TAILCALL - TAILJMP_RAX + free_stack __PWTB_StackAlloc + POP_ARGUMENT_REGISTERS + POP_CALLEE_SAVED_REGISTERS + jmp r10 NESTED_END ResolveWorkerAsmStub, _TEXT diff --git a/src/coreclr/vm/codeman.cpp b/src/coreclr/vm/codeman.cpp index a1bf0bc4c6bfa2..9aabcd273c494d 100644 --- a/src/coreclr/vm/codeman.cpp +++ b/src/coreclr/vm/codeman.cpp @@ -1271,6 +1271,9 @@ void EEJitManager::SetCpuInfo() if (((cpuFeatures & XArchIntrinsicConstants_Avx) != 0) && CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableAVX)) { CPUCompileFlags.Set(InstructionSet_AVX); +#ifdef UNIX_AMD64_ABI + g_avxSupported = 1; +#endif } if (((cpuFeatures & XArchIntrinsicConstants_Avx2) != 0) && CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableAVX2)) @@ -1283,6 +1286,9 @@ void EEJitManager::SetCpuInfo() if (((cpuFeatures & XArchIntrinsicConstants_Avx512) != 0) && CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableAVX512)) { CPUCompileFlags.Set(InstructionSet_AVX512); +#ifdef UNIX_AMD64_ABI + g_avx512Supported = 1; +#endif } // x86-64-vFuture diff --git a/src/coreclr/vm/methodtable.cpp b/src/coreclr/vm/methodtable.cpp index 470d35b23cbbfc..5d0885d11ff49d 100644 --- a/src/coreclr/vm/methodtable.cpp +++ b/src/coreclr/vm/methodtable.cpp @@ -2131,29 +2131,57 @@ bool MethodTable::ClassifyEightBytesWithManagedLayout(SystemVStructRegisterPassi return true; } - // The SIMD Intrinsic types are meant to be handled specially and should not be passed as struct registers + // SIMD Intrinsic types should be classified as SSE to pass in XMM registers. + // We must NOT fall through to normal field enumeration because the internal fields + // (e.g., Vector64._00 is ulong) would classify as INTEGER instead of SSE. 
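+    // For example, Vector128<float> (16 bytes) is recorded here as two SSE eightbytes covering
+    // offsets 0-7 and 8-15, rather than as INTEGER eightbytes derived from its ulong backing fields.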
if (IsIntrinsicType()) { LPCUTF8 namespaceName; LPCUTF8 className = GetFullyQualifiedNameInfo(&namespaceName); + bool isSIMDType = false; + unsigned structSize = 0; + if ((strcmp(className, "Vector512`1") == 0) || (strcmp(className, "Vector256`1") == 0) || (strcmp(className, "Vector128`1") == 0) || (strcmp(className, "Vector64`1") == 0)) { assert(strcmp(namespaceName, "System.Runtime.Intrinsics") == 0); - - LOG((LF_JIT, LL_EVERYTHING, "%*s**** ClassifyEightBytesWithManagedLayout: struct %s is a SIMD intrinsic type; will not be enregistered\n", - nestingLevel * 5, "", this->GetDebugClassName())); - - return false; + structSize = GetNumInstanceFieldBytes(); + isSIMDType = true; + } + else if ((strcmp(className, "Vector`1") == 0) && (strcmp(namespaceName, "System.Numerics") == 0)) + { + structSize = GetNumInstanceFieldBytes(); + isSIMDType = true; } - if ((strcmp(className, "Vector`1") == 0) && (strcmp(namespaceName, "System.Numerics") == 0)) + if (isSIMDType) { - LOG((LF_JIT, LL_EVERYTHING, "%*s**** ClassifyEightBytesWithManagedLayout: struct %s is a SIMD intrinsic type; will not be enregistered\n", - nestingLevel * 5, "", this->GetDebugClassName())); + // All callers (SystemVAmd64CheckForPassStructInRegister, ClassifyEightBytes) + // already filter out structs > CLR_SYSTEMV_MAX_STRUCT_BYTES_TO_PASS_IN_REGISTERS (16 bytes). + // Only Vector64 (8B) and Vector128 (16B) can reach here. Vector256/512 are handled + // by the JIT's handleAsSingleSimd path and never consult this classification. + _ASSERTE(structSize <= CLR_SYSTEMV_MAX_STRUCT_BYTES_TO_PASS_IN_REGISTERS); - return false; + // Directly classify each 8-byte chunk as SSE so the JIT passes them in XMM registers. + for (unsigned offset = 0; offset < structSize; offset += SYSTEMV_EIGHT_BYTE_SIZE_IN_BYTES) + { + unsigned eightByteSize = min(static_cast(SYSTEMV_EIGHT_BYTE_SIZE_IN_BYTES), structSize - offset); + unsigned normalizedOffset = offset + startOffsetOfStruct; + + helperPtr->fieldClassifications[helperPtr->currentUniqueOffsetField] = SystemVClassificationTypeSSE; + helperPtr->fieldSizes[helperPtr->currentUniqueOffsetField] = eightByteSize; + helperPtr->fieldOffsets[helperPtr->currentUniqueOffsetField] = normalizedOffset; + helperPtr->currentUniqueOffsetField++; + + LOG((LF_JIT, LL_EVERYTHING, "%*s**** ClassifyEightBytesWithManagedLayout: SIMD type %s eightbyte at offset %u classified as SSE\n", + nestingLevel * 5, "", this->GetDebugClassName(), normalizedOffset)); + } + + helperPtr->largestFieldOffset = (int)(startOffsetOfStruct + structSize - min(structSize, static_cast(SYSTEMV_EIGHT_BYTE_SIZE_IN_BYTES))); + + AssignClassifiedEightByteTypes(helperPtr, nestingLevel); + return true; } } @@ -2359,29 +2387,57 @@ bool MethodTable::ClassifyEightBytesWithNativeLayout(SystemVStructRegisterPassin numIntroducedFields = pNativeLayoutInfo->GetSize() / pNativeFieldDescs->NativeSize(); } - // The SIMD Intrinsic types are meant to be handled specially and should not be passed as struct registers + // SIMD Intrinsic types should be classified as SSE to pass in XMM registers. + // We must NOT fall through to normal field enumeration because the internal fields + // (e.g., Vector64._00 is ulong) would classify as INTEGER instead of SSE. 
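+    // This mirrors ClassifyEightBytesWithManagedLayout above for types with a native layout,
+    // e.g. when these vectors appear in interop signatures corresponding to native __m64/__m128.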
if (IsIntrinsicType()) { LPCUTF8 namespaceName; LPCUTF8 className = GetFullyQualifiedNameInfo(&namespaceName); + bool isSIMDType = false; + unsigned structSize = 0; + if ((strcmp(className, "Vector512`1") == 0) || (strcmp(className, "Vector256`1") == 0) || (strcmp(className, "Vector128`1") == 0) || (strcmp(className, "Vector64`1") == 0)) { assert(strcmp(namespaceName, "System.Runtime.Intrinsics") == 0); - - LOG((LF_JIT, LL_EVERYTHING, "%*s**** ClassifyEightBytesWithNativeLayout: struct %s is a SIMD intrinsic type; will not be enregistered\n", - nestingLevel * 5, "", this->GetDebugClassName())); - - return false; + structSize = pNativeLayoutInfo->GetSize(); + isSIMDType = true; + } + else if ((strcmp(className, "Vector`1") == 0) && (strcmp(namespaceName, "System.Numerics") == 0)) + { + structSize = pNativeLayoutInfo->GetSize(); + isSIMDType = true; } - if ((strcmp(className, "Vector`1") == 0) && (strcmp(namespaceName, "System.Numerics") == 0)) + if (isSIMDType) { - LOG((LF_JIT, LL_EVERYTHING, "%*s**** ClassifyEightBytesWithNativeLayout: struct %s is a SIMD intrinsic type; will not be enregistered\n", - nestingLevel * 5, "", this->GetDebugClassName())); + // All callers (SystemVAmd64CheckForPassNativeStructInRegister) already filter out + // structs > CLR_SYSTEMV_MAX_STRUCT_BYTES_TO_PASS_IN_REGISTERS (16 bytes). + // Only Vector64 (8B) and Vector128 (16B) can reach here. Vector256/512 are handled + // by the JIT's handleAsSingleSimd path and never consult this classification. + _ASSERTE(structSize <= CLR_SYSTEMV_MAX_STRUCT_BYTES_TO_PASS_IN_REGISTERS); - return false; + // Directly classify each 8-byte chunk as SSE so the JIT passes them in XMM registers. + for (unsigned offset = 0; offset < structSize; offset += SYSTEMV_EIGHT_BYTE_SIZE_IN_BYTES) + { + unsigned eightByteSize = min(static_cast(SYSTEMV_EIGHT_BYTE_SIZE_IN_BYTES), structSize - offset); + unsigned normalizedOffset = offset + startOffsetOfStruct; + + helperPtr->fieldClassifications[helperPtr->currentUniqueOffsetField] = SystemVClassificationTypeSSE; + helperPtr->fieldSizes[helperPtr->currentUniqueOffsetField] = eightByteSize; + helperPtr->fieldOffsets[helperPtr->currentUniqueOffsetField] = normalizedOffset; + helperPtr->currentUniqueOffsetField++; + + LOG((LF_JIT, LL_EVERYTHING, "%*s**** ClassifyEightBytesWithNativeLayout: SIMD type %s eightbyte at offset %u classified as SSE\n", + nestingLevel * 5, "", this->GetDebugClassName(), normalizedOffset)); + } + + helperPtr->largestFieldOffset = (int)(startOffsetOfStruct + structSize - min(structSize, static_cast(SYSTEMV_EIGHT_BYTE_SIZE_IN_BYTES))); + + AssignClassifiedEightByteTypes(helperPtr, nestingLevel); + return true; } } diff --git a/src/coreclr/vm/vars.cpp b/src/coreclr/vm/vars.cpp index f6fd1eac774627..678a1989cf54c9 100644 --- a/src/coreclr/vm/vars.cpp +++ b/src/coreclr/vm/vars.cpp @@ -101,6 +101,11 @@ GVAL_IMPL_INIT(DWORD, g_TlsIndex, TLS_OUT_OF_INDEXES); GVAL_IMPL_INIT(DWORD, g_offsetOfCurrentThreadInfo, 0); GVAL_IMPL_INIT(DWORD, g_gcNotificationFlags, 0); +#if defined(TARGET_AMD64) && defined(UNIX_AMD64_ABI) +extern "C" uint8_t g_avxSupported = 0; +extern "C" uint8_t g_avx512Supported = 0; +#endif + MethodTable* g_pCastHelpers; diff --git a/src/coreclr/vm/vars.hpp b/src/coreclr/vm/vars.hpp index 246291fdeab900..b310e8d51d981f 100644 --- a/src/coreclr/vm/vars.hpp +++ b/src/coreclr/vm/vars.hpp @@ -365,6 +365,11 @@ GVAL_DECL(DWORD, g_TlsIndex); GVAL_DECL(DWORD, g_offsetOfCurrentThreadInfo); GVAL_DECL(DWORD, g_gcNotificationFlags); +#if defined(TARGET_AMD64) && 
defined(UNIX_AMD64_ABI) +extern "C" uint8_t g_avxSupported; +extern "C" uint8_t g_avx512Supported; +#endif + GPTR_DECL(MethodTable, g_pEHClass); GPTR_DECL(MethodTable, g_pExceptionServicesInternalCallsClass); GPTR_DECL(MethodTable, g_pStackFrameIteratorClass); diff --git a/src/tests/JIT/Directed/VectorABI/VectorMgdMgd256.cs b/src/tests/JIT/Directed/VectorABI/VectorMgdMgd256.cs new file mode 100644 index 00000000000000..77e20531c1b113 --- /dev/null +++ b/src/tests/JIT/Directed/VectorABI/VectorMgdMgd256.cs @@ -0,0 +1,266 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +using System.Runtime.CompilerServices; +using Xunit; + +// Test passing and returning HVA (homogeneous vector aggregate) structs containing +// Vector256 and Vector512 elements. +// +// On System V x64: +// - A single Vector256/512 is passed in a single YMM/ZMM register via handleAsSingleSimd. +// - HVA structs containing multiple vectors are larger than 16 bytes and are passed on the stack +// per the System V ABI (structs > 2 eightbytes go on the stack). +// - This test verifies that both single-vector and multi-vector HVAs are handled correctly, +// and that values are not corrupted during argument/return value passing. + +public static class VectorMgdMgd256 +{ + private const int PASS = 100; + private const int FAIL = 0; + + public const int DefaultSeed = 20010415; + public static int Seed = Environment.GetEnvironmentVariable("CORECLR_SEED") switch + { + string seedStr when seedStr.Equals("random", StringComparison.OrdinalIgnoreCase) => new Random().Next(), + string seedStr when int.TryParse(seedStr, out int envSeed) => envSeed, + _ => DefaultSeed + }; + + static Random random = new Random(Seed); + + static bool isPassing = true; + + static void Check(string msg, bool condition) + { + if (!condition) + { + Console.WriteLine($"FAIL: {msg}"); + isPassing = false; + } + } + + // ======== HVA structs with Vector256 ======== + + public struct HVA256_01 { public Vector256 v0; } + public struct HVA256_02 { public Vector256 v0; public Vector256 v1; } + public struct HVA256_03 { public Vector256 v0; public Vector256 v1; public Vector256 v2; } + + // ======== HVA structs with Vector512 ======== + + public struct HVA512_01 { public Vector512 v0; } + public struct HVA512_02 { public Vector512 v0; public Vector512 v1; } + + // ======== Single Vector256/512 argument tests ======== + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector256 PassSingle256(Vector256 a) => a; + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector256 Add256(Vector256 a, Vector256 b) => a + b; + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector256 PassMany256(Vector256 a, Vector256 b, Vector256 c, Vector256 d) + => a + b + c + d; + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector256 Mixed256(int x, Vector256 v, long y) + => v + Vector256.Create(x + (int)y); + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector512 PassSingle512(Vector512 a) => a; + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector512 Add512(Vector512 a, Vector512 b) => a + b; + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector512 Mixed512(int x, Vector512 v, long y) + => v + Vector512.Create(x + (int)y); + + // ======== HVA argument tests (passed on stack on SysV x64) ======== + + [MethodImpl(MethodImplOptions.NoInlining)] + static HVA256_01 
PassHVA256_01(HVA256_01 h) => h; + + [MethodImpl(MethodImplOptions.NoInlining)] + static HVA256_02 PassHVA256_02(HVA256_02 h) => h; + + [MethodImpl(MethodImplOptions.NoInlining)] + static HVA256_03 PassHVA256_03(HVA256_03 h) => h; + + [MethodImpl(MethodImplOptions.NoInlining)] + static HVA512_01 PassHVA512_01(HVA512_01 h) => h; + + [MethodImpl(MethodImplOptions.NoInlining)] + static HVA512_02 PassHVA512_02(HVA512_02 h) => h; + + // ======== HVA return tests ======== + + [MethodImpl(MethodImplOptions.NoInlining)] + static HVA256_01 ReturnHVA256_01(Vector256 v0) => new HVA256_01 { v0 = v0 }; + + [MethodImpl(MethodImplOptions.NoInlining)] + static HVA256_02 ReturnHVA256_02(Vector256 v0, Vector256 v1) => new HVA256_02 { v0 = v0, v1 = v1 }; + + [MethodImpl(MethodImplOptions.NoInlining)] + static HVA512_01 ReturnHVA512_01(Vector512 v0) => new HVA512_01 { v0 = v0 }; + + // ======== Mixed HVA + scalar argument tests ======== + + [MethodImpl(MethodImplOptions.NoInlining)] + static int HVA256WithScalars(int a, HVA256_01 h, int b) + => a + b + h.v0.GetElement(0); + + [MethodImpl(MethodImplOptions.NoInlining)] + static int HVA512WithScalars(long a, HVA512_01 h, long b) + => (int)(a + b) + h.v0.GetElement(0); + + // ======== Reflection tests (forces real calling convention) ======== + + static void TestReflection256() + { + var v = Vector256.Create(42); + var h01 = new HVA256_01 { v0 = v }; + + var method = typeof(VectorMgdMgd256).GetMethod(nameof(PassHVA256_01), + System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Static); + var result = (HVA256_01)method.Invoke(null, new object[] { h01 }); + Check("Reflection PassHVA256_01.v0", result.v0 == v); + + var h02 = new HVA256_02 { v0 = Vector256.Create(10), v1 = Vector256.Create(20) }; + var method2 = typeof(VectorMgdMgd256).GetMethod(nameof(PassHVA256_02), + System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Static); + var result2 = (HVA256_02)method2.Invoke(null, new object[] { h02 }); + Check("Reflection PassHVA256_02.v0", result2.v0 == Vector256.Create(10)); + Check("Reflection PassHVA256_02.v1", result2.v1 == Vector256.Create(20)); + } + + static void TestReflection512() + { + var v = Vector512.Create(42); + var h01 = new HVA512_01 { v0 = v }; + + var method = typeof(VectorMgdMgd256).GetMethod(nameof(PassHVA512_01), + System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Static); + var result = (HVA512_01)method.Invoke(null, new object[] { h01 }); + Check("Reflection PassHVA512_01.v0", result.v0 == v); + } + + [Fact] + public static int TestEntryPoint() + { + Console.WriteLine($"Vector256.Count = {Vector256.Count}"); + Console.WriteLine($"Vector512.Count = {Vector512.Count}"); + + if (Avx.IsSupported) + { + // ---- Single Vector256 tests ---- + Console.WriteLine("=== Single Vector256 tests ==="); + + var v256 = Vector256.Create(1, 2, 3, 4, 5, 6, 7, 8); + Check("PassSingle256", PassSingle256(v256) == v256); + + Check("Add256", Add256(Vector256.Create(1), Vector256.Create(2)) == Vector256.Create(3)); + + Check("PassMany256", PassMany256( + Vector256.Create(1), Vector256.Create(2), Vector256.Create(3), Vector256.Create(4)) == Vector256.Create(10)); + + Check("Mixed256", Mixed256(3, Vector256.Create(10), 7L) == Vector256.Create(20)); + + // ---- HVA256 argument tests ---- + Console.WriteLine("=== HVA256 argument tests ==="); + + var hva256_01 = new HVA256_01 { v0 = Vector256.Create(random.Next(100)) }; + var r01 = PassHVA256_01(hva256_01); + Check("PassHVA256_01.v0", r01.v0 == hva256_01.v0); 
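+            // The two- and three-element HVAs below are larger than 16 bytes, so (per the header
+            // comment) they take the stack path; these checks guard against corruption there.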
+ + var hva256_02 = new HVA256_02 { v0 = Vector256.Create(random.Next(100)), v1 = Vector256.Create(random.Next(100)) }; + var r02 = PassHVA256_02(hva256_02); + Check("PassHVA256_02.v0", r02.v0 == hva256_02.v0); + Check("PassHVA256_02.v1", r02.v1 == hva256_02.v1); + + var hva256_03 = new HVA256_03 + { + v0 = Vector256.Create(random.Next(100)), + v1 = Vector256.Create(random.Next(100)), + v2 = Vector256.Create(random.Next(100)) + }; + var r03 = PassHVA256_03(hva256_03); + Check("PassHVA256_03.v0", r03.v0 == hva256_03.v0); + Check("PassHVA256_03.v1", r03.v1 == hva256_03.v1); + Check("PassHVA256_03.v2", r03.v2 == hva256_03.v2); + + // ---- HVA256 return tests ---- + Console.WriteLine("=== HVA return tests ==="); + + var retH01 = ReturnHVA256_01(Vector256.Create(77)); + Check("ReturnHVA256_01.v0", retH01.v0 == Vector256.Create(77)); + + var retH02 = ReturnHVA256_02(Vector256.Create(10), Vector256.Create(20)); + Check("ReturnHVA256_02.v0", retH02.v0 == Vector256.Create(10)); + Check("ReturnHVA256_02.v1", retH02.v1 == Vector256.Create(20)); + + // ---- Mixed scalar + HVA256 tests ---- + Console.WriteLine("=== Mixed scalar + HVA tests ==="); + + var hMixed = new HVA256_01 { v0 = Vector256.Create(100) }; + Check("HVA256WithScalars", HVA256WithScalars(1, hMixed, 2) == 103); + + // ---- Reflection256 tests ---- + Console.WriteLine("=== Reflection tests ==="); + + TestReflection256(); + } + else + { + Console.WriteLine("=== Skipping Vector256 tests: AVX not supported ==="); + } + + if (Avx512F.IsSupported) + { + // ---- Single Vector512 tests ---- + Console.WriteLine("=== Single Vector512 tests ==="); + + var v512 = Vector512.Create(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + Check("PassSingle512", PassSingle512(v512) == v512); + + Check("Add512", Add512(Vector512.Create(1), Vector512.Create(2)) == Vector512.Create(3)); + + Check("Mixed512", Mixed512(3, Vector512.Create(10), 7L) == Vector512.Create(20)); + + // ---- HVA512 argument tests ---- + Console.WriteLine("=== HVA512 argument tests ==="); + + var hva512_01 = new HVA512_01 { v0 = Vector512.Create(random.Next(100)) }; + var r512_01 = PassHVA512_01(hva512_01); + Check("PassHVA512_01.v0", r512_01.v0 == hva512_01.v0); + + var hva512_02 = new HVA512_02 { v0 = Vector512.Create(random.Next(100)), v1 = Vector512.Create(random.Next(100)) }; + var r512_02 = PassHVA512_02(hva512_02); + Check("PassHVA512_02.v0", r512_02.v0 == hva512_02.v0); + Check("PassHVA512_02.v1", r512_02.v1 == hva512_02.v1); + + // ---- HVA512 return tests ---- + var retH512 = ReturnHVA512_01(Vector512.Create(99)); + Check("ReturnHVA512_01.v0", retH512.v0 == Vector512.Create(99)); + + // ---- Mixed scalar + HVA512 tests ---- + var hMixed512 = new HVA512_01 { v0 = Vector512.Create(200) }; + Check("HVA512WithScalars", HVA512WithScalars(1L, hMixed512, 2L) == 203); + + // ---- Reflection512 tests ---- + TestReflection512(); + } + else + { + Console.WriteLine("=== Skipping Vector512 tests: AVX-512 not supported ==="); + } + + Console.WriteLine(isPassing ? "Test Passed" : "Test FAILED"); + + return isPassing ? 
PASS : FAIL; + } +} diff --git a/src/tests/JIT/Directed/VectorABI/VectorMgdMgd256_ro.csproj b/src/tests/JIT/Directed/VectorABI/VectorMgdMgd256_ro.csproj new file mode 100644 index 00000000000000..347e499430f708 --- /dev/null +++ b/src/tests/JIT/Directed/VectorABI/VectorMgdMgd256_ro.csproj @@ -0,0 +1,9 @@ + + + True + True + + + + + diff --git a/src/tests/JIT/Directed/VectorABI/VectorNumericsRegPass.cs b/src/tests/JIT/Directed/VectorABI/VectorNumericsRegPass.cs new file mode 100644 index 00000000000000..5642198760cc7d --- /dev/null +++ b/src/tests/JIT/Directed/VectorABI/VectorNumericsRegPass.cs @@ -0,0 +1,132 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Numerics; +using System.Runtime.CompilerServices; +using Xunit; + +// Test that System.Numerics.Vector types are passed in registers on Unix AMD64 +// for the active hardware-dependent vector size (for example, 16 bytes with SSE or +// 32 bytes on AVX-capable machines) according to the System V ABI. + +public static class VectorNumericsRegPass +{ + private const int PASS = 100; + private const int FAIL = 0; + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector AddVectors(Vector a, Vector b) + { + return a + b; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector MultiplyVectors(Vector a, Vector b) + { + return a * b; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector SubtractVectors(Vector a, Vector b) + { + return a - b; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static float SumVector(Vector v) + { + float sum = 0; + for (int i = 0; i < Vector.Count; i++) + { + sum += v[i]; + } + return sum; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector PassMultipleVectors(Vector a, Vector b, Vector c, int scalar) + { + return (a + b) * c + new Vector(scalar); + } + + [Fact] + public static int TestEntryPoint() + { + Console.WriteLine($"Vector.Count = {Vector.Count}"); + Console.WriteLine($"Vector size = {Vector.Count * sizeof(float)} bytes"); + Console.WriteLine($"Vector.Count = {Vector.Count}"); + Console.WriteLine($"Vector.Count = {Vector.Count}"); + + // Test with float vectors + var vf1 = new Vector(1.0f); + var vf2 = new Vector(2.0f); + var vfResult = AddVectors(vf1, vf2); + + for (int i = 0; i < Vector.Count; i++) + { + if (Math.Abs(vfResult[i] - 3.0f) > 0.001f) + { + Console.WriteLine($"FAIL: Float vector addition failed at index {i}: {vfResult[i]} != 3.0"); + return FAIL; + } + } + + // Test with int vectors + var vi1 = new Vector(5); + var vi2 = new Vector(3); + var viResult = MultiplyVectors(vi1, vi2); + + for (int i = 0; i < Vector.Count; i++) + { + if (viResult[i] != 15) + { + Console.WriteLine($"FAIL: Int vector multiplication failed at index {i}: {viResult[i]} != 15"); + return FAIL; + } + } + + // Test with double vectors + var vd1 = new Vector(10.0); + var vd2 = new Vector(3.0); + var vdResult = SubtractVectors(vd1, vd2); + + for (int i = 0; i < Vector.Count; i++) + { + if (Math.Abs(vdResult[i] - 7.0) > 0.001) + { + Console.WriteLine($"FAIL: Double vector subtraction failed at index {i}: {vdResult[i]} != 7.0"); + return FAIL; + } + } + + // Test sum operation + var vfSum = new Vector(4.0f); + float sum = SumVector(vfSum); + float expectedSum = 4.0f * Vector.Count; + if (Math.Abs(sum - expectedSum) > 0.001f) + { + Console.WriteLine($"FAIL: Sum operation failed: {sum} != {expectedSum}"); + return FAIL; + } + + // Test multiple vector parameters 
with scalar mixing + var va = new Vector(1.0f); + var vb = new Vector(2.0f); + var vc = new Vector(3.0f); + var vmResult = PassMultipleVectors(va, vb, vc, 5); + + // (1 + 2) * 3 + 5 = 14 + for (int i = 0; i < Vector.Count; i++) + { + if (Math.Abs(vmResult[i] - 14.0f) > 0.001f) + { + Console.WriteLine($"FAIL: Multiple parameter test failed at index {i}: {vmResult[i]} != 14.0"); + return FAIL; + } + } + + Console.WriteLine("PASS: All tests succeeded!"); + return PASS; + } +} diff --git a/src/tests/JIT/Directed/VectorABI/VectorNumericsRegPass.csproj b/src/tests/JIT/Directed/VectorABI/VectorNumericsRegPass.csproj new file mode 100644 index 00000000000000..297241bb63ea66 --- /dev/null +++ b/src/tests/JIT/Directed/VectorABI/VectorNumericsRegPass.csproj @@ -0,0 +1,9 @@ + + + True + True + + + + + diff --git a/src/tests/JIT/Directed/VectorABI/VectorRegPassSysV.cs b/src/tests/JIT/Directed/VectorABI/VectorRegPassSysV.cs new file mode 100644 index 00000000000000..34ddb8fbbf938f --- /dev/null +++ b/src/tests/JIT/Directed/VectorABI/VectorRegPassSysV.cs @@ -0,0 +1,309 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +using Xunit; + +// Test that Vector128, Vector256, and Vector512 are correctly passed as arguments +// and returned from methods on System V x64 (Linux), verifying the single-register +// SIMD passing path in the JIT's SysVX64Classifier. +// +// Vector128 (16B) -> XMM register +// Vector256 (32B) -> YMM register (requires AVX) +// Vector512 (64B) -> ZMM register (requires AVX-512) + +public static class VectorRegPassSysV +{ + private const int PASS = 100; + private const int FAIL = 0; + + // --- Vector128 tests --- + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector128 AddVec128(Vector128 a, Vector128 b) + { + return a + b; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector128 PassManyVec128(Vector128 a, Vector128 b, Vector128 c, + Vector128 d, Vector128 e, int scalar) + { + return a + b + c + d + e + Vector128.Create(scalar); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector128 MixedArgsVec128(int x, Vector128 v, long y) + { + return v + Vector128.Create((float)(x + y)); + } + + // --- Vector256 tests (require AVX) --- + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector256 AddVec256(Vector256 a, Vector256 b) + { + return a + b; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector256 PassManyVec256(Vector256 a, Vector256 b, Vector256 c, + Vector256 d, int scalar) + { + return a + b + c + d + Vector256.Create(scalar); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector256 MixedArgsVec256(int x, Vector256 v, long y) + { + return v + Vector256.Create((float)(x + y)); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector256 ReturnVec256(int value) + { + return Vector256.Create(value); + } + + // --- Vector512 tests (require AVX-512) --- + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector512 AddVec512(Vector512 a, Vector512 b) + { + return a + b; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector512 ReturnVec512(int value) + { + return Vector512.Create(value); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector512 MixedArgsVec512(int x, Vector512 v, long y) + { + return v + Vector512.Create((float)(x + y)); + } + + // --- Vector64 tests --- 
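+    // Vector64<T> is 8 bytes: a single SSE-classified eightbyte, passed and returned in the low
+    // half of an XMM register.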
+ + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector64 AddVec64(Vector64 a, Vector64 b) + { + return a + b; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector64 ReturnVec64(float a, float b) + { + return Vector64.Create(a, b); + } + + // --- Chained return tests (return of one call feeds into the next) --- + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector128 ChainVec128(Vector128 a, Vector128 b, + Vector128 c, Vector128 d) + { + return AddVec128F(AddVec128F(a, b), AddVec128F(c, d)); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector128 AddVec128F(Vector128 a, Vector128 b) + { + return a + b; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector256 ChainVec256(Vector256 a, Vector256 b, + Vector256 c, Vector256 d) + { + return AddVec256F(AddVec256F(a, b), AddVec256F(c, d)); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector256 AddVec256F(Vector256 a, Vector256 b) + { + return a + b; + } + + // --- Vector128 return tests --- + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector128 AddVec128D(Vector128 a, Vector128 b) + { + return a + b; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector256 AddVec256D(Vector256 a, Vector256 b) + { + return a + b; + } + + // --- Multi-size mixing: different vector widths in one call --- + + [MethodImpl(MethodImplOptions.NoInlining)] + static Vector128 MixedSizes(Vector128 a, Vector256 b, int scalar) + { + return a + b.GetLower() + Vector128.Create(scalar); + } + + // --- Return into struct field --- + + struct VectorPair128 + { + public Vector128 Lo; + public Vector128 Hi; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static VectorPair128 ReturnPair128(Vector128 a, Vector128 b) + { + VectorPair128 result; + result.Lo = AddVec128(a, Vector128.Create(1)); + result.Hi = AddVec128(b, Vector128.Create(2)); + return result; + } + + // --- Test helpers --- + + static bool Check(T actual, T expected, string testName) where T : IEquatable + { + if (!actual.Equals(expected)) + { + Console.WriteLine($"FAIL: {testName}: expected {expected}, got {actual}"); + return false; + } + return true; + } + + [Fact] + public static int TestEntryPoint() + { + bool pass = true; + + // --- Vector64 tests --- + Console.WriteLine("=== Vector64 tests ==="); + + pass &= Check(AddVec64(Vector64.Create(1, 2), Vector64.Create(10, 20)), + Vector64.Create(11, 22), "AddVec64"); + + pass &= Check(ReturnVec64(3.0f, 4.0f), Vector64.Create(3.0f, 4.0f), "ReturnVec64"); + + // --- Vector128 tests (always available on x64) --- + Console.WriteLine("=== Vector128 tests ==="); + + var v128a = Vector128.Create(1, 2, 3, 4); + var v128b = Vector128.Create(10, 20, 30, 40); + pass &= Check(AddVec128(v128a, v128b), Vector128.Create(11, 22, 33, 44), "AddVec128"); + + pass &= Check( + PassManyVec128( + Vector128.Create(1), Vector128.Create(2), Vector128.Create(3), + Vector128.Create(4), Vector128.Create(5), 100), + Vector128.Create(115), + "PassManyVec128"); + + pass &= Check(MixedArgsVec128(3, Vector128.Create(10.0f), 7L), + Vector128.Create(20.0f), "MixedArgsVec128"); + + // --- Vector128 tests --- + pass &= Check(AddVec128D(Vector128.Create(1.5, 2.5), Vector128.Create(3.0, 4.0)), + Vector128.Create(4.5, 6.5), "AddVec128D"); + + // --- Vector128 chained return tests --- + // ChainVec128: (1+2) + (0.5+0.1) = 3.6 + pass &= Check( + ChainVec128(Vector128.Create(1.0f), Vector128.Create(2.0f), + Vector128.Create(0.5f), Vector128.Create(0.1f)), + Vector128.Create(3.6f), "ChainVec128"); + + // --- 
Vector256 tests --- + Console.WriteLine("=== Vector256 tests ==="); + Console.WriteLine($" Avx.IsSupported = {Avx.IsSupported}"); + + if (Avx.IsSupported) + { + var v256a = Vector256.Create(1, 2, 3, 4, 5, 6, 7, 8); + var v256b = Vector256.Create(10, 20, 30, 40, 50, 60, 70, 80); + pass &= Check(AddVec256(v256a, v256b), + Vector256.Create(11, 22, 33, 44, 55, 66, 77, 88), "AddVec256"); + + pass &= Check( + PassManyVec256( + Vector256.Create(1), Vector256.Create(2), Vector256.Create(3), + Vector256.Create(4), 100), + Vector256.Create(110), + "PassManyVec256"); + + pass &= Check(MixedArgsVec256(3, Vector256.Create(10.0f), 7L), + Vector256.Create(20.0f), "MixedArgsVec256"); + + pass &= Check(ReturnVec256(42), Vector256.Create(42), "ReturnVec256"); + + // --- Vector256 tests --- + pass &= Check(AddVec256D(Vector256.Create(1.0), Vector256.Create(2.0)), + Vector256.Create(3.0), "AddVec256D"); + + // --- Vector256 chained return tests --- + pass &= Check( + ChainVec256(Vector256.Create(1.0f), Vector256.Create(2.0f), + Vector256.Create(0.5f), Vector256.Create(0.1f)), + Vector256.Create(3.6f), "ChainVec256"); + } + else + { + Console.WriteLine(" Skipping Vector256 tests because AVX is not supported."); + } + + // --- Vector512 tests --- + Console.WriteLine("=== Vector512 tests ==="); + Console.WriteLine($" Avx512F.IsSupported = {Avx512F.IsSupported}"); + + if (Avx512F.IsSupported) + { + var v512a = Vector512.Create(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + var v512b = Vector512.Create(10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160); + pass &= Check(AddVec512(v512a, v512b), + Vector512.Create(11, 22, 33, 44, 55, 66, 77, 88, 99, 110, 121, 132, 143, 154, 165, 176), + "AddVec512"); + + pass &= Check(ReturnVec512(99), Vector512.Create(99), "ReturnVec512"); + + pass &= Check(MixedArgsVec512(3, Vector512.Create(10.0f), 7L), + Vector512.Create(20.0f), "MixedArgsVec512"); + } + else + { + Console.WriteLine(" Skipping Vector512 tests because AVX-512 is not supported."); + } + + // --- Multi-size mixing tests --- + Console.WriteLine("=== Multi-size mixing tests ==="); + + if (Avx.IsSupported) + { + pass &= Check( + MixedSizes(Vector128.Create(1), Vector256.Create(10), 5), + Vector128.Create(16), "MixedSizes_128_256"); + } + else + { + Console.WriteLine(" Skipping multi-size mixing tests because AVX is not supported."); + } + + // --- Return into struct field tests --- + Console.WriteLine("=== Return into struct tests ==="); + + var pair = ReturnPair128(Vector128.Create(10), Vector128.Create(20)); + pass &= Check(pair.Lo, Vector128.Create(11), "ReturnPair128.Lo"); + pass &= Check(pair.Hi, Vector128.Create(22), "ReturnPair128.Hi"); + + Console.WriteLine(pass ? "PASS" : "FAIL"); + return pass ? 
PASS : FAIL; + } +} diff --git a/src/tests/JIT/Directed/VectorABI/VectorRegPassSysV.csproj b/src/tests/JIT/Directed/VectorABI/VectorRegPassSysV.csproj new file mode 100644 index 00000000000000..fc2728afce92c6 --- /dev/null +++ b/src/tests/JIT/Directed/VectorABI/VectorRegPassSysV.csproj @@ -0,0 +1,9 @@ + + + True + True + + + + + diff --git a/src/tests/JIT/Stress/ABI/ABIs.cs b/src/tests/JIT/Stress/ABI/ABIs.cs index fbe547f14a2d7b..0c593c8f8b2a87 100644 --- a/src/tests/JIT/Stress/ABI/ABIs.cs +++ b/src/tests/JIT/Stress/ABI/ABIs.cs @@ -48,7 +48,7 @@ internal class Win86Abi : IAbi { typeof(byte), typeof(short), typeof(int), typeof(long), typeof(float), typeof(double), typeof(Int128), - typeof(Vector), typeof(Vector128), typeof(Vector256), + typeof(Vector), typeof(Vector128), typeof(Vector256), typeof(Vector512), typeof(S1P), typeof(S2P), typeof(S2U), typeof(S3U), typeof(S4P), typeof(S4U), typeof(S5U), typeof(S6U), typeof(S7U), typeof(S8P), typeof(S8U), typeof(S9U), @@ -106,7 +106,7 @@ internal class SysVAbi : IAbi { typeof(byte), typeof(short), typeof(int), typeof(long), typeof(float), typeof(double), - typeof(Vector), typeof(Vector128), typeof(Vector256), + typeof(Vector), typeof(Vector128), typeof(Vector256), typeof(Vector512), typeof(Int128), typeof(S1P), typeof(S2P), typeof(S2U), typeof(S3U), typeof(S4P), typeof(S4U), typeof(S5U), typeof(S6U),