Arm64/SVE: Add support to handle predicate registers as callee-trash #104065

Merged · 15 commits · Jun 28, 2024 · Changes from 5 commits
3 changes: 3 additions & 0 deletions src/coreclr/jit/codegencommon.cpp
@@ -215,6 +215,9 @@ CodeGen::CodeGen(Compiler* theCompiler)
#endif // DEBUG

regSet.tmpInit();
#if defined(TARGET_ARM64)
predicateOffset = 0;
#endif

#ifdef LATE_DISASM
getDisAssembler().disInit(compiler);
7 changes: 6 additions & 1 deletion src/coreclr/jit/codegeninterface.h
@@ -141,7 +141,12 @@ class CodeGenInterface
RegState intRegState;
RegState floatRegState;
NodeInternalRegisters internalRegisters;

#if defined(TARGET_ARM64)
// Tracks the stack offset of the first *temp* predicate
// register; it is then used to produce the stack address
// at which predicate temps are loaded/stored.
int predicateOffset;
#endif // TARGET_ARM64
protected:
Compiler* compiler;
bool m_genAlignLoops;
18 changes: 17 additions & 1 deletion src/coreclr/jit/compiler.h
@@ -1430,12 +1430,21 @@ class TempDsc
int tdNum;
BYTE tdSize;
var_types tdType;
#if defined(TARGET_ARM64)
// Only used for TYP_MASK to track the sequence of predicate
// register temps. We use this to ld/st them from the stack
// using `ldr pX, [sp, #seqNum, mul vl]`.
BYTE tdSeqNum;
#endif // TARGET_ARM64

public:
TempDsc(int _tdNum, unsigned _tdSize, var_types _tdType)
TempDsc(int _tdNum, unsigned _tdSize, var_types _tdType, unsigned _tdSeqNum)
: tdNum(_tdNum)
, tdSize((BYTE)_tdSize)
, tdType(_tdType)
#if defined(TARGET_ARM64)
, tdSeqNum((BYTE)_tdSeqNum)
#endif // TARGET_ARM64
{
#ifdef DEBUG
// temps must have a negative number (so they have a different number from all local variables)
@@ -1484,6 +1493,13 @@
{
return tdType;
}
#ifdef TARGET_ARM64
unsigned tdTempSeqNum() const
{
assert(varTypeIsMask(tdType));
return tdSeqNum;
}
#endif
};

// Specify compiler data that a phase might modify
11 changes: 10 additions & 1 deletion src/coreclr/jit/compiler.hpp
@@ -2708,7 +2708,16 @@ inline
tmpDsc = codeGen->regSet.tmpFindNum(varNum, RegSet::TEMP_USAGE_USED);
}
assert(tmpDsc != nullptr);
varOffset = tmpDsc->tdTempOffs();
#if defined(TARGET_ARM64)
if (varTypeIsMask(tmpDsc->tdTempType()))
{
varOffset = tmpDsc->tdTempSeqNum();
}
else
#endif // TARGET_ARM64
{
varOffset = tmpDsc->tdTempOffs();
}
}
else
{
64 changes: 62 additions & 2 deletions src/coreclr/jit/emitarm64.cpp
@@ -7884,7 +7884,36 @@ void emitter::emitIns_R_S(instruction ins, emitAttr attr, regNumber reg1, int va
isSimple = false;
size = EA_SCALABLE;
attr = size;
fmt = isVectorRegister(reg1) ? IF_SVE_IE_2A : IF_SVE_ID_2A;
if (isPredicateRegister(reg1))
{
assert(offs == 0);
// For predicate, generate based off rsGetRsvdReg()
regNumber rsvdReg = codeGen->rsGetRsvdReg();

if (varx >= 0)
{
// local

// add rsvd, fp, #imm
emitIns_R_R_I(INS_add, EA_8BYTE, rsvdReg, reg2, imm);
// ldr p0, [rsvd, #0, mul vl]
emitIns_R_R_I(ins, attr, reg1, rsvdReg, 0);
}
else
{
// temp

// `base` contains seqNum and offs = 0, so imm contains seqNum
// add rsvd, fp, #predicateStartOffset
emitIns_R_R_I(INS_add, EA_8BYTE, rsvdReg, reg2, codeGen->predicateOffset);
// ldr p0, [rsvd, #imm, mul vl]
emitIns_R_R_I(ins, attr, reg1, rsvdReg, imm);
}
return;
}

assert(isVectorRegister(reg1));
fmt = IF_SVE_IE_2A;
Contributor: Eventually, I wonder if this code (for SVE vectors) should be refactored to call out to an emit_R_R_I function instead of falling into the non-SVE code below.

Member (author): I agree.
// TODO-SVE: Don't assume 128bit vectors
// Predicate size is vector length / 8
@@ -8135,7 +8164,38 @@ void emitter::emitIns_S_R(instruction ins, emitAttr attr, regNumber reg1, int va
isSimple = false;
size = EA_SCALABLE;
attr = size;
fmt = isVectorRegister(reg1) ? IF_SVE_JH_2A : IF_SVE_JG_2A;

if (isPredicateRegister(reg1))
{
assert(offs == 0);

// For predicate, generate based off rsGetRsvdReg()
regNumber rsvdReg = codeGen->rsGetRsvdReg();

if (varx >= 0)
{
// local

// add rsvd, fp, #imm
emitIns_R_R_I(INS_add, EA_8BYTE, rsvdReg, reg2, imm);
// str p0, [rsvd, #0, mul vl]
emitIns_R_R_I(ins, attr, reg1, rsvdReg, 0);
}
else
{
// temp

// `base` contains seqNum and offs = 0, so imm contains seqNum
// add rsvd, fp, #predicateStartOffset
emitIns_R_R_I(INS_add, EA_8BYTE, rsvdReg, reg2, codeGen->predicateOffset);
// str p0, [rsvd, #seqNum, mul vl]
emitIns_R_R_I(ins, attr, reg1, rsvdReg, imm);
}
return;
}

assert(isVectorRegister(reg1));
fmt = IF_SVE_JH_2A;

// TODO-SVE: Don't assume 128bit vectors
// Predicate size is vector length / 8
5 changes: 5 additions & 0 deletions src/coreclr/jit/emitarm64.h
@@ -1205,6 +1205,11 @@ inline static bool isHighPredicateRegister(regNumber reg)
return (reg >= REG_PREDICATE_HIGH_FIRST) && (reg <= REG_PREDICATE_HIGH_LAST);
}

inline static bool isMaskReg(regNumber reg)
{
return isPredicateRegister(reg);
}

inline static bool isEvenRegister(regNumber reg)
{
if (isGeneralRegister(reg))
10 changes: 9 additions & 1 deletion src/coreclr/jit/lclvars.cpp
@@ -5593,7 +5593,7 @@ unsigned Compiler::lvaGetMaxSpillTempSize()
* Doing this all in one pass is 'hard'. So instead we do it in 2 basic passes:
* 1. Assign all the offsets relative to the Virtual '0'. Offsets above (the
* incoming arguments) are positive. Offsets below (everything else) are
* negative. This pass also calcuates the total frame size (between Caller's
* negative. This pass also calculates the total frame size (between Caller's
* SP/return address and the Ambient SP).
* 2. Figure out where to place the frame pointer, and then adjust the offsets
* as needed for the final stack size and whether the offset is frame pointer
@@ -5872,6 +5872,14 @@ void Compiler::lvaFixVirtualFrameOffsets()
for (TempDsc* temp = codeGen->regSet.tmpListBeg(); temp != nullptr; temp = codeGen->regSet.tmpListNxt(temp))
{
temp->tdAdjustTempOffs(delta);
#if defined(TARGET_ARM64)
Member: Is there some guarantee that all the predicate temps end up adjacent on this list? Otherwise it seems like this indexing scheme might not work out.

Member (author): If you see below, we iterate over all the types and call tmpPreAllocateTemps with the number of slots we need for that type:

for (int i = 0; i < TYP_COUNT; i++)
{
if (var_types(i) != RegSet::tmpNormalizeType(var_types(i)))
{
// Only normalized types should have anything in the maxSpill array.
// We assume here that if type 'i' does not normalize to itself, then
// nothing else normalizes to 'i', either.
assert(maxSpill[i] == 0);
}
if (maxSpill[i] != 0)
{
JITDUMP(" %s: %d\n", varTypeName(var_types(i)), maxSpill[i]);
compiler->codeGen->regSet.tmpPreAllocateTemps(var_types(i), maxSpill[i]);
}
}

In tmpPreAllocateTemps(), we iterate through the number of slots we want to allocate and create them:

for (unsigned i = 0; i < count; i++)
{
tmpCount++;
tmpSize += size;
#ifdef TARGET_ARM
if (type == TYP_DOUBLE)
{
// Adjust tmpSize to accommodate possible alignment padding.
// Note that at this point the offsets aren't yet finalized, so we don't yet know if it will be required.
tmpSize += TARGET_POINTER_SIZE;
}
#endif // TARGET_ARM
TempDsc* temp = new (m_rsCompiler, CMK_Unknown) TempDsc(-((int)tmpCount), size, type);

if (varTypeIsMask(temp->tdTempType()) && temp->tdTempSeqNum() == 0)
{
// For the first register, store the offset, which we will use to
// generate the offsets for subsequent temp mask registers
codeGen->predicateOffset = temp->tdTempOffs();
}
#endif
}

lvaCachedGenericContextArgOffs += delta;
4 changes: 2 additions & 2 deletions src/coreclr/jit/lsra.h
@@ -508,13 +508,13 @@ class RegRecord : public Referenceable
{
registerType = FloatRegisterType;
}
#if defined(TARGET_XARCH) && defined(FEATURE_SIMD)
#if defined(FEATURE_MASKED_HW_INTRINSICS)
else
{
assert(emitter::isMaskReg(reg));
registerType = MaskRegisterType;
}
#endif
#endif // FEATURE_MASKED_HW_INTRINSICS
regNum = reg;
isCalleeSave = ((RBM_CALLEE_SAVED & genRegMask(reg)) != 0);
}
7 changes: 5 additions & 2 deletions src/coreclr/jit/lsrabuild.cpp
@@ -855,6 +855,9 @@ regMaskTP LinearScan::getKillSetForCall(GenTreeCall* call)

#else
killMask.RemoveRegsetForType(RBM_FLT_CALLEE_TRASH.GetFloatRegSet(), FloatRegisterType);
#if defined(TARGET_ARM64)
killMask.RemoveRegsetForType(RBM_MSK_CALLEE_TRASH.GetFloatRegSet(), MaskRegisterType);
#endif // TARGET_ARM64
#endif // TARGET_XARCH
}
#ifdef TARGET_ARM
@@ -1148,8 +1151,8 @@ bool LinearScan::buildKillPositionsForNode(GenTree* tree, LsraLocation currentLo
{
continue;
}
Interval* interval = getIntervalForLocalVar(varIndex);
const bool isCallKill = ((killMask == RBM_INT_CALLEE_TRASH) || (killMask == RBM_CALLEE_TRASH));
Interval* interval = getIntervalForLocalVar(varIndex);
const bool isCallKill = ((killMask.getLow() == RBM_INT_CALLEE_TRASH) || (killMask == RBM_CALLEE_TRASH));
SingleTypeRegSet regsKillMask = killMask.GetRegSetForType(interval->registerType);

if (isCallKill)
2 changes: 1 addition & 1 deletion src/coreclr/jit/regset.cpp
@@ -705,7 +705,7 @@ void RegSet::tmpPreAllocateTemps(var_types type, unsigned count)
}
#endif // TARGET_ARM

TempDsc* temp = new (m_rsCompiler, CMK_Unknown) TempDsc(-((int)tmpCount), size, type);
TempDsc* temp = new (m_rsCompiler, CMK_Unknown) TempDsc(-((int)tmpCount), size, type, i);

#ifdef DEBUG
if (m_rsCompiler->verbose)
17 changes: 8 additions & 9 deletions src/coreclr/jit/targetarm64.h
@@ -75,8 +75,15 @@
#define RBM_FLT_CALLEE_SAVED (RBM_V8|RBM_V9|RBM_V10|RBM_V11|RBM_V12|RBM_V13|RBM_V14|RBM_V15)
#define RBM_FLT_CALLEE_TRASH (RBM_V0|RBM_V1|RBM_V2|RBM_V3|RBM_V4|RBM_V5|RBM_V6|RBM_V7|RBM_V16|RBM_V17|RBM_V18|RBM_V19|RBM_V20|RBM_V21|RBM_V22|RBM_V23|RBM_V24|RBM_V25|RBM_V26|RBM_V27|RBM_V28|RBM_V29|RBM_V30|RBM_V31)

#define RBM_LOWMASK (RBM_P0|RBM_P1|RBM_P2|RBM_P3|RBM_P4|RBM_P5|RBM_P6|RBM_P7)
#define RBM_HIGHMASK (RBM_P8|RBM_P9|RBM_P10| RBM_P11|RBM_P12|RBM_P13|RBM_P14|RBM_P15)
#define RBM_ALLMASK (RBM_LOWMASK|RBM_HIGHMASK)

#define RBM_MSK_CALLEE_SAVED (0)
#define RBM_MSK_CALLEE_TRASH RBM_ALLMASK
Member (author): Somewhere I should just zero it out if we are not running on an SVE machine.

Member: Is the TP cost coming from the additional killed registers? I assume that's because we don't have a predicate-register equivalent of compFloatingPointUsed.

I wonder if you could just add a case for predicate registers here:

compiler->compFloatingPointUsed = true;

And then during allocation, mask out the predicate registers when processing kills if no predicate registers were used. We would still be creating additional RegRecords though, but maybe this helps a bit.

Member: > We would still be creating additional RegRecords though, but maybe this helps a bit.

Actually I guess we were creating those RegRecords even before this PR, so I imagine it would help quite a bit for this PR.

Member (author): I am already doing it in https://github.com/dotnet/runtime/pull/104065/files#diff-ad66a6bcf1fd550d5ad10d995c03218afbbc39463d36e1f2a224f9ca070a2f99R858-R860. Predicate registers exist only in the presence of floating-point usage. Yes, we do process the newly added predicate registers in processKills(), and that's what shows up impacting TP. For a non-SVE arm64 machine, we don't have to iterate through them.

Member (author): > Somewhere I should just zero it out if we are not running on an SVE machine.

Although note that when we altjit, we say "sve capability enabled", so we will see predicate registers and will process them during kills. The TP information will be misleading for those cases, but I will add this anyway so that on a non-SVE arm64 machine, we do not process them.

Member: I think the use of predicate registers is going to be much more rare than using float registers, hence adding this extra check would help regardless.

> I will add this anyway so that on a non-SVE arm64 machine, we do not process them.

I don't see a good reason to try optimizing for non-SVE machines. In the future we would expect most arm64 machines to be SVE enabled, right? I think we should rather optimize for the common case of "predicate registers not used". It should be possible now that we are only creating one RefTypeKill per call.

Member (author): > I don't see a good reason to try optimizing for non-SVE machines. In the future we would expect most arm64 machines to be SVE enabled, right?

Yes.

> I think the use of predicate registers is going to be much more rare than using float registers, hence adding this extra check would help regardless.

Agree. I will do a separate pass for it. #104157 tracks it.

Member (author): Thinking about this a bit more, I think I will just revert bcfd8a8 and do it properly in #104157.


#define RBM_CALLEE_SAVED (RBM_INT_CALLEE_SAVED | RBM_FLT_CALLEE_SAVED)
#define RBM_CALLEE_TRASH (RBM_INT_CALLEE_TRASH | RBM_FLT_CALLEE_TRASH)
#define RBM_CALLEE_TRASH (RBM_INT_CALLEE_TRASH | RBM_FLT_CALLEE_TRASH | RBM_MSK_CALLEE_TRASH)

#define REG_DEFAULT_HELPER_CALL_TARGET REG_R12
#define RBM_DEFAULT_HELPER_CALL_TARGET RBM_R12
@@ -146,14 +153,6 @@
#define REG_JUMP_THUNK_PARAM REG_R12
#define RBM_JUMP_THUNK_PARAM RBM_R12

#define RBM_LOWMASK (RBM_P0 | RBM_P1 | RBM_P2 | RBM_P3 | RBM_P4 | RBM_P5 | RBM_P6 | RBM_P7)
#define RBM_HIGHMASK (RBM_P8 | RBM_P9 | RBM_P10 | RBM_P11 | RBM_P12 | RBM_P13 | RBM_P14 | RBM_P15)
#define RBM_ALLMASK (RBM_LOWMASK | RBM_HIGHMASK)

// TODO-SVE: Fix when adding predicate register allocation
#define RBM_MSK_CALLEE_SAVED (0)
#define RBM_MSK_CALLEE_TRASH (0)

// ARM64 write barrier ABI (see vm\arm64\asmhelpers.asm, vm\arm64\asmhelpers.S):
// CORINFO_HELP_ASSIGN_REF (JIT_WriteBarrier), CORINFO_HELP_CHECKED_ASSIGN_REF (JIT_CheckedWriteBarrier):
// On entry: