Arm64 apple vm fixes for arg alignment. (#46665)

* Add `MarshalInfo::IsValueClass`. * Add `TypeHandle* pTypeHandle` to `SizeOf`. * Add a few asserts/start using inline function instead of macro. * use TARGET_POINTER_SIZE instead of STACK_ELEM_SIZE. * Use `m_curOfs` instead of `m_idxStack` in `ArgIteratorBase` on all platforms. Before some platforms were using stackSlots, some curOfs in bytes. * Use byte sizes and offsets in `ArgLocDesc`. Fix arm32. x86 fixes. use StackSize on ArgSizes Add `GetStackArgumentByteIndexFromOffset` and return back the old values for asserts. another fix * Stop using `#define STACK_ELEM_SIZE` * Add `isFloatHfa`. * delete checking code. because it won't pass on arm64 apple. * arm64 apple fixes. * roundUp the stack size. * Add a reflection test. * Return byte offset from `GetNextOfs`. It is not a complete fix for arm64 apple, but covers most cases. * Add `FLOAT_REGISTER_SIZE` * Use StackElemSize for ` pLoc->m_byteStackSize`. * replace `assert` with `_ASSERTE`. * Use `ALIGN_UP` in the code that I have changed. * rename `m_curOfs` as `m_ofsStack`. * delete "ceremony " from `StackElemSize`. * Delete `cSlots` and don't call `StackElemSize` on `GetArgSize`. * Fix an assert. * Fix nit. * fix wrong return for hfa<float>. * fix nit. * Fix crossgen job.
dotnet · Jan 21, 2021 · 648437b · 648437b
1 parent bf573c7
commit 648437b
Show file tree

Hide file tree

Showing 23 changed files with 385 additions and 195 deletions.
diff --git a/src/coreclr/classlibnative/bcltype/varargsnative.cpp b/src/coreclr/classlibnative/bcltype/varargsnative.cpp
@@ -21,7 +21,7 @@
 // pointer to achieve such an alignment for the next argument on those platforms (otherwise it is a no-op).
 // NOTE: the debugger has its own implementation of this algorithm in Debug\DI\RsType.cpp, CordbType::RequiresAlign8()
 //       so if you change this implementation be sure to update the debugger's version as well.
-static void AdjustArgPtrForAlignment(VARARGS *pData, size_t cbArg)
+static void AdjustArgPtrForAlignment(VARARGS *pData, unsigned cbArg)
 {
 #ifdef TARGET_ARM
     // Only 64-bit primitives or value types with embedded 64-bit primitives are aligned on 64-bit boundaries.
@@ -114,7 +114,9 @@ static void InitCommon(VARARGS *data, VASigCookie** cookie)
     // which is the start of the first fixed arg (arg1).
 
     // Always skip over the varargs_cookie.
-    data->ArgPtr += StackElemSize(sizeof(LPVOID));
+    const bool isValueType = false;
+    const bool isFloatHfa = false;
+    data->ArgPtr += StackElemSize(TARGET_POINTER_SIZE, isValueType, isFloatHfa);
 #endif
 }
 
@@ -138,9 +140,11 @@ void AdvanceArgPtr(VARARGS *data)
             break;
 
         SigTypeContext      typeContext; // This is an empty type context.  This is OK because the vararg methods may not be generic
-        SIZE_T cbRaw = data->SigPtr.SizeOf(data->ArgCookie->pModule, &typeContext);
-        SIZE_T cbArg = StackElemSize(cbRaw);
-
+        TypeHandle thValueType;
+        const unsigned cbRaw = data->SigPtr.SizeOf(data->ArgCookie->pModule, &typeContext, &thValueType);
+        const bool isValueType = (!thValueType.IsNull() && thValueType.IsValueType());
+        const bool isFloatHfa = false;
+        unsigned cbArg = StackElemSize(cbRaw, isValueType, isFloatHfa);
 #ifdef ENREGISTERED_PARAMTYPE_MAXSIZE
         if (ArgIterator::IsVarArgPassedByRef(cbRaw))
             cbArg = sizeof(void*);
@@ -263,9 +267,11 @@ VarArgsNative::Init2,
             }
 
             SigTypeContext typeContext; // This is an empty type context.  This is OK because the vararg methods may not be generic
-            SIZE_T cbRaw = data->SigPtr.SizeOf(data->ArgCookie->pModule,&typeContext);
-            SIZE_T cbArg = StackElemSize(cbRaw);
-
+            TypeHandle thValueType;
+            unsigned cbRaw = data->SigPtr.SizeOf(data->ArgCookie->pModule,&typeContext, &thValueType);
+            const bool isValueType = (!thValueType.IsNull() && thValueType.IsValueType());
+            const bool isFloatHfa = false;
+            unsigned cbArg = StackElemSize(cbRaw, isValueType, isFloatHfa);
 #ifdef ENREGISTERED_PARAMTYPE_MAXSIZE
             if (ArgIterator::IsVarArgPassedByRef(cbRaw))
                 cbArg = sizeof(void*);
@@ -401,7 +407,8 @@ FCIMPL3(void, VarArgsNative::GetNextArg2, VARARGS* _this, void * value, ReflectC
     TypeHandle typehandle = refType->GetType();
 
     _ASSERTE(_this != NULL);
-    UINT size = 0;
+    unsigned size = 0;
+    bool isValueType = false;
 
     CorElementType typ = typehandle.GetInternalCorElementType();
     if (CorTypeInfo::IsPrimitiveType(typ))
@@ -414,15 +421,15 @@ FCIMPL3(void, VarArgsNative::GetNextArg2, VARARGS* _this, void * value, ReflectC
     }
     else if (typ == ELEMENT_TYPE_VALUETYPE)
     {
+        isValueType = true;
         size = typehandle.AsMethodTable()->GetNativeSize();
     }
     else
     {
         COMPlusThrow(kNotSupportedException, W("NotSupported_Type"));
     }
-
-    size = StackElemSize(size);
-
+    const bool isFloatHfa = false;
+    size = StackElemSize(size, isValueType, isFloatHfa);
     AdjustArgPtrForAlignment(_this, size);
 
 #ifdef ENREGISTERED_PARAMTYPE_MAXSIZE
@@ -472,9 +479,11 @@ VarArgsNative::GetNextArgHelper(
     _ASSERTE(data->RemainingArgs != 0);
 
     SigTypeContext typeContext; // This is an empty type context.  This is OK because the vararg methods may not be generic
-    SIZE_T cbRaw = data->SigPtr.SizeOf(data->ArgCookie->pModule,&typeContext);
-    SIZE_T cbArg = StackElemSize(cbRaw);
-
+    TypeHandle thValueType;
+    const unsigned cbRaw = data->SigPtr.SizeOf(data->ArgCookie->pModule,&typeContext, &thValueType);
+    const bool isValueType = (!thValueType.IsNull() && thValueType.IsValueType());
+    const bool isFloatHfa = false;
+    unsigned cbArg = StackElemSize(cbRaw, isValueType, isFloatHfa);
     AdjustArgPtrForAlignment(data, cbArg);
 
     // Get a pointer to the beginning of the argument.

diff --git a/src/coreclr/gcinfo/gcinfoencoder.cpp b/src/coreclr/gcinfo/gcinfoencoder.cpp
@@ -577,8 +577,10 @@ GcSlotId GcInfoEncoder::GetStackSlotId( INT32 spOffset, GcSlotFlags flags, GcSta
 
     _ASSERTE( (flags & (GC_SLOT_IS_REGISTER | GC_SLOT_IS_DELETED)) == 0 );
 
+#if !defined(OSX_ARM64_ABI)
     // the spOffset for the stack slot is required to be pointer size aligned
     _ASSERTE((spOffset % TARGET_POINTER_SIZE) == 0);
+#endif
 
     m_SlotTable[ m_NumSlots ].Slot.Stack.SpOffset = spOffset;
     m_SlotTable[ m_NumSlots ].Slot.Stack.Base = spBase;

diff --git a/src/coreclr/inc/stdmacros.h b/src/coreclr/inc/stdmacros.h
@@ -185,17 +185,11 @@ inline size_t ALIGN_UP( size_t val, size_t alignment )
     _ASSERTE( result >= val );      // check for overflow
     return result;
 }
-inline void* ALIGN_UP( void* val, size_t alignment )
-{
-    WRAPPER_NO_CONTRACT;
 
-    return (void*) ALIGN_UP( (size_t)val, alignment );
-}
-inline uint8_t* ALIGN_UP( uint8_t* val, size_t alignment )
+template <typename T> inline T ALIGN_UP(T val, size_t alignment)
 {
     WRAPPER_NO_CONTRACT;
-
-    return (uint8_t*) ALIGN_UP( (size_t)val, alignment );
+    return (T)ALIGN_UP((size_t)val, alignment);
 }
 
 inline size_t ALIGN_DOWN( size_t val, size_t alignment )

diff --git a/src/coreclr/vm/amd64/cgencpu.h b/src/coreclr/vm/amd64/cgencpu.h
@@ -101,6 +101,8 @@ EXTERN_C void FastCallFinalizeWorker(Object *obj, PCODE funcPtr);
 #define X86RegFromAMD64Reg(extended_reg) \
             ((X86Reg)(((int)extended_reg) & X86_REGISTER_MASK))
 
+#define FLOAT_REGISTER_SIZE 16 // each register in FloatArgumentRegisters is 16 bytes.
+
 // Why is the return value ARG_SLOT? On 64-bit systems, that is 64-bits
 // and much bigger than necessary for R4, requiring explicit downcasts.
 inline
@@ -145,11 +147,11 @@ void     R8ToFPSpill(void* pSpillSlot, SIZE_T  srcDoubleAsSIZE_T)
 // Parameter size
 //**********************************************************************
 
-typedef INT64 StackElemType;
-#define STACK_ELEM_SIZE sizeof(StackElemType)
-
-// !! This expression assumes STACK_ELEM_SIZE is a power of 2.
-#define StackElemSize(parmSize) (((parmSize) + STACK_ELEM_SIZE - 1) & ~((ULONG)(STACK_ELEM_SIZE - 1)))
+inline unsigned StackElemSize(unsigned parmSize, bool isValueType = false /* unused */, bool isFloatHfa = false /* unused */)
+{
+    const unsigned stackSlotSize = 8;
+    return ALIGN_UP(parmSize, stackSlotSize);
+}
 
 //**********************************************************************
 // Frames

diff --git a/src/coreclr/vm/arm/cgencpu.h b/src/coreclr/vm/arm/cgencpu.h
@@ -95,15 +95,17 @@ EXTERN_C void setFPReturn(int fpSize, INT64 retVal);
 // Offset of pc register
 #define PC_REG_RELATIVE_OFFSET 4
 
+#define FLOAT_REGISTER_SIZE 4 // each register in FloatArgumentRegisters is 4 bytes.
+
 //**********************************************************************
 // Parameter size
 //**********************************************************************
 
-typedef INT32 StackElemType;
-#define STACK_ELEM_SIZE sizeof(StackElemType)
-
-// !! This expression assumes STACK_ELEM_SIZE is a power of 2.
-#define StackElemSize(parmSize) (((parmSize) + STACK_ELEM_SIZE - 1) & ~((ULONG)(STACK_ELEM_SIZE - 1)))
+inline unsigned StackElemSize(unsigned parmSize, bool isValueType = false /* unused */, bool isFloatHfa = false /* unused */)
+{
+    const unsigned stackSlotSize = 4;
+    return ALIGN_UP(parmSize, stackSlotSize);
+}
 
 //**********************************************************************
 // Frames

diff --git a/src/coreclr/vm/arm64/cgencpu.h b/src/coreclr/vm/arm64/cgencpu.h
@@ -58,6 +58,8 @@ extern PCODE GetPreStubEntryPoint();
 #define CALLDESCR_FPARGREGS                     1   // CallDescrWorker has FloatArgumentRegisters parameter
 #define CALLDESCR_RETBUFFARGREG                 1   // CallDescrWorker has RetBuffArg parameter that's separate from arg regs
 
+#define FLOAT_REGISTER_SIZE 16 // each register in FloatArgumentRegisters is 16 bytes.
+
 // Given a return address retrieved during stackwalk,
 // this is the offset by which it should be decremented to arrive at the callsite.
 #define STACKWALK_CONTROLPC_ADJUST_OFFSET 4
@@ -81,13 +83,27 @@ void     R8ToFPSpill(void* pSpillSlot, SIZE_T  srcDoubleAsSIZE_T)
 // Parameter size
 //**********************************************************************
 
-typedef INT64 StackElemType;
-#define STACK_ELEM_SIZE sizeof(StackElemType)
-
-// The expression below assumes STACK_ELEM_SIZE is a power of 2, so check that.
-static_assert(((STACK_ELEM_SIZE & (STACK_ELEM_SIZE-1)) == 0), "STACK_ELEM_SIZE must be a power of 2");
+inline unsigned StackElemSize(unsigned parmSize, bool isValueType, bool isFloatHfa)
+{
+#if defined(OSX_ARM64_ABI)
+    if (!isValueType)
+    {
+        // The primitive types' sizes are expected to be powers of 2.
+        _ASSERTE((parmSize & (parmSize - 1)) == 0);
+        // No padding/alignment for primitive types.
+        return parmSize;
+    }
+    if (isFloatHfa)
+    {
+        _ASSERTE((parmSize % 4) == 0);
+        // float hfa is not considered a struct type and passed with 4-byte alignment.
+        return parmSize;
+    }
+#endif
 
-#define StackElemSize(parmSize) (((parmSize) + STACK_ELEM_SIZE - 1) & ~((ULONG)(STACK_ELEM_SIZE - 1)))
+    const unsigned stackSlotSize = 8;
+    return ALIGN_UP(parmSize, stackSlotSize);
+}
 
 //
 // JIT HELPERS.

diff --git a/src/coreclr/vm/callhelpers.cpp b/src/coreclr/vm/callhelpers.cpp
@@ -517,7 +517,8 @@ void MethodDescCallSite::CallTargetWorker(const ARG_SLOT *pArguments, ARG_SLOT *
     CallDescrData callDescrData;
 
     callDescrData.pSrc = pTransitionBlock + sizeof(TransitionBlock);
-    callDescrData.numStackSlots = nStackBytes / STACK_ELEM_SIZE;
+    _ASSERTE((nStackBytes % TARGET_POINTER_SIZE) == 0);
+    callDescrData.numStackSlots = nStackBytes / TARGET_POINTER_SIZE;
 #ifdef CALLDESCR_ARGREGS
     callDescrData.pArgumentRegisters = (ArgumentRegisters*)(pTransitionBlock + TransitionBlock::GetOffsetOfArgumentRegisters());
 #endif