3 changes: 3 additions & 0 deletions src/hotspot/cpu/aarch64/aarch64.ad
@@ -2007,6 +2007,9 @@ uint MachSpillCopyNode::implementation(C2_MacroAssembler *masm, PhaseRegAlloc *r
 
   if (bottom_type()->isa_vect() && !bottom_type()->isa_vectmask()) {
     uint ireg = ideal_reg();
+    DEBUG_ONLY(int algm = MIN2(RegMask::num_registers(ireg), (int)Matcher::stack_alignment_in_slots()) * VMRegImpl::stack_slot_size);
+    assert((src_lo_rc != rc_stack) || is_aligned(src_offset, algm), "unaligned vector spill sp offset %d (src)", src_offset);
+    assert((dst_lo_rc != rc_stack) || is_aligned(dst_offset, algm), "unaligned vector spill sp offset %d (dst)", dst_offset);
     if (ireg == Op_VecA && masm) {
       int sve_vector_reg_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
       if (src_lo_rc == rc_stack && dst_lo_rc == rc_stack) {
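For context, the granule checked by the new asserts is the smaller of the spilled register set's size and the stack alignment, converted to bytes. A standalone sketch of that check with assumed example values (4-byte stack slots, a 128-bit vector occupying 4 slots, 16-byte stack alignment), not the real HotSpot accessors:

#include <algorithm>
#include <cassert>

int main() {
  const int stack_slot_size       = 4;  // assumed: VMRegImpl::stack_slot_size
  const int vector_slots          = 4;  // assumed: RegMask::num_registers() for a 128-bit vector
  const int stack_alignment_slots = 4;  // assumed: Matcher::stack_alignment_in_slots()

  // Alignment granule in bytes, mirroring the DEBUG_ONLY(algm) computation above.
  int algm = std::min(vector_slots, stack_alignment_slots) * stack_slot_size;  // 16

  // What the asserts require: a stack-based spill offset is a multiple of the granule.
  int src_offset = 32;
  assert(src_offset % algm == 0);
  return 0;
}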
70 changes: 42 additions & 28 deletions src/hotspot/cpu/ppc/ppc.ad
@@ -1795,69 +1795,77 @@ uint MachSpillCopyNode::implementation(C2_MacroAssembler *masm, PhaseRegAlloc *r
     return size; // Self copy, no move.
 
   if (bottom_type()->isa_vect() != nullptr && ideal_reg() == Op_VecX) {
+    int src_offset = ra_->reg2offset(src_lo);
+    int dst_offset = ra_->reg2offset(dst_lo);
+    DEBUG_ONLY(int algm = MIN2(RegMask::num_registers(ideal_reg()), (int)Matcher::stack_alignment_in_slots()) * VMRegImpl::stack_slot_size);
+    assert((src_lo_rc != rc_stack) || is_aligned(src_offset, algm), "unaligned vector spill sp offset %d (src)", src_offset);
+    assert((dst_lo_rc != rc_stack) || is_aligned(dst_offset, algm), "unaligned vector spill sp offset %d (dst)", dst_offset);
     // Memory->Memory Spill.
     if (src_lo_rc == rc_stack && dst_lo_rc == rc_stack) {
-      int src_offset = ra_->reg2offset(src_lo);
-      int dst_offset = ra_->reg2offset(dst_lo);
       if (masm) {
         __ ld(R0, src_offset, R1_SP);
         __ std(R0, dst_offset, R1_SP);
         __ ld(R0, src_offset+8, R1_SP);
         __ std(R0, dst_offset+8, R1_SP);
       }
       size += 16;
+#ifndef PRODUCT
+      if (st != nullptr) {
+        st->print("%-7s [R1_SP + #%d] -> [R1_SP + #%d] \t// vector spill copy", "SPILL", src_offset, dst_offset);
+      }
+#endif // !PRODUCT
     }
     // VectorRegister->Memory Spill.
     else if (src_lo_rc == rc_vec && dst_lo_rc == rc_stack) {
       VectorSRegister Rsrc = as_VectorRegister(Matcher::_regEncode[src_lo]).to_vsr();
-      int dst_offset = ra_->reg2offset(dst_lo);
       if (PowerArchitecturePPC64 >= 9) {
-        if (is_aligned(dst_offset, 16)) {
-          if (masm) {
-            __ stxv(Rsrc, dst_offset, R1_SP); // matches storeV16_Power9
-          }
-          size += 4;
-        } else {
-          // Other alignment can be used by Vector API (VectorPayload in rearrangeOp,
-          // observed with VectorRearrangeTest.java on Power9).
-          if (masm) {
-            __ addi(R0, R1_SP, dst_offset);
-            __ stxvx(Rsrc, R0); // matches storeV16_Power9 (regarding element ordering)
-          }
-          size += 8;
+        if (masm) {
+          __ stxv(Rsrc, dst_offset, R1_SP); // matches storeV16_Power9
         }
+        size += 4;
       } else {
         if (masm) {
           __ addi(R0, R1_SP, dst_offset);
           __ stxvd2x(Rsrc, R0); // matches storeV16_Power8
         }
         size += 8;
       }
+#ifndef PRODUCT
+      if (st != nullptr) {
+        if (PowerArchitecturePPC64 >= 9) {
+          st->print("%-7s %s, [R1_SP + #%d] \t// vector spill copy", "STXV", Matcher::regName[src_lo], dst_offset);
+        } else {
+          st->print("%-7s %s, R1_SP, %d \t// vector spill copy", "ADDI", Matcher::regName[src_lo], dst_offset);
+          st->print("%-7s %s, [R1_SP] \t// vector spill copy", "STXVD2X", Matcher::regName[src_lo]);
+        }
+      }
+#endif // !PRODUCT
     }
     // Memory->VectorRegister Spill.
     else if (src_lo_rc == rc_stack && dst_lo_rc == rc_vec) {
       VectorSRegister Rdst = as_VectorRegister(Matcher::_regEncode[dst_lo]).to_vsr();
-      int src_offset = ra_->reg2offset(src_lo);
       if (PowerArchitecturePPC64 >= 9) {
-        if (is_aligned(src_offset, 16)) {
-          if (masm) {
-            __ lxv(Rdst, src_offset, R1_SP);
-          }
-          size += 4;
-        } else {
-          if (masm) {
-            __ addi(R0, R1_SP, src_offset);
-            __ lxvx(Rdst, R0);
-          }
-          size += 8;
+        if (masm) {
+          __ lxv(Rdst, src_offset, R1_SP);
         }
+        size += 4;
       } else {
         if (masm) {
           __ addi(R0, R1_SP, src_offset);
           __ lxvd2x(Rdst, R0);
         }
         size += 8;
       }
+#ifndef PRODUCT
+      if (st != nullptr) {
+        if (PowerArchitecturePPC64 >= 9) {
+          st->print("%-7s %s, [R1_SP + #%d] \t// vector spill copy", "LXV", Matcher::regName[dst_lo], src_offset);
+        } else {
+          st->print("%-7s %s, R1_SP, %d \t// vector spill copy", "ADDI", Matcher::regName[src_lo], src_offset);
+          st->print("%-7s %s, [R1_SP] \t// vector spill copy", "LXVD2X", Matcher::regName[dst_lo]);
+        }
+      }
+#endif // !PRODUCT
     }
     // VectorRegister->VectorRegister.
     else if (src_lo_rc == rc_vec && dst_lo_rc == rc_vec) {
@@ -1867,6 +1875,12 @@ uint MachSpillCopyNode::implementation(C2_MacroAssembler *masm, PhaseRegAlloc *r
         __ xxlor(Rdst, Rsrc, Rsrc);
       }
       size += 4;
+#ifndef PRODUCT
+      if (st != nullptr) {
+        st->print("%-7s %s, %s, %s\t// vector spill copy",
+                  "XXLOR", Matcher::regName[dst_lo], Matcher::regName[src_lo], Matcher::regName[src_lo]);
+      }
+#endif // !PRODUCT
     }
     else {
       ShouldNotReachHere(); // No VR spill.
2 changes: 1 addition & 1 deletion src/hotspot/share/opto/chaitin.hpp
@@ -143,7 +143,7 @@ class LRG : public ResourceObj {
 
 private:
   // Number of registers this live range uses when it colors
-  uint16_t _num_regs;           // 2 for Longs and Doubles, 1 for all else
+  uint16_t _num_regs;           // byte size of the value divided by 4
                                 // except _num_regs is kill count for fat_proj
 
   // For scalable register, num_regs may not be the actual physical register size.
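In other words, _num_regs now scales with the value's size in 32-bit slots instead of only distinguishing singles from pairs. A minimal illustration, assuming 4-byte stack slots:

#include <cassert>

// Assumed: a value of byte_size bytes occupies byte_size / 4 registers/slots.
static int num_regs_for(int byte_size) { return byte_size / 4; }

int main() {
  assert(num_regs_for(4)  == 1);  // int/float: 1
  assert(num_regs_for(8)  == 2);  // long/double: 2 (the old "2 for Longs and Doubles")
  assert(num_regs_for(16) == 4);  // 128-bit vector (VecX): a set of 4
  return 0;
}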
13 changes: 6 additions & 7 deletions src/hotspot/share/opto/matcher.cpp
@@ -283,13 +283,12 @@ void Matcher::match( ) {
     _parm_regs[i].set_pair(reg2, reg1);
   }
 
-  // Finally, make sure the incoming arguments take up an even number of
-  // words, in case the arguments or locals need to contain doubleword stack
-  // slots. The rest of the system assumes that stack slot pairs (in
-  // particular, in the spill area) which look aligned will in fact be
-  // aligned relative to the stack pointer in the target machine. Double
-  // stack slots will always be allocated aligned.
-  _new_SP = OptoReg::Name(align_up(_in_arg_limit, (int)RegMask::SlotsPerLong));
+  // Allocated register sets are aligned to their size. Offsets to the stack
+  // pointer have to be aligned to the size of the access. For this, _new_SP is
+  // aligned to the size of the largest register set, with the stack alignment as
+  // an upper limit and a minimum of SlotsPerLong (2).
+  int vector_alignment = MIN2(C->max_vector_size(), stack_alignment_in_bytes()) / VMRegImpl::stack_slot_size;
+  _new_SP = OptoReg::Name(align_up(_in_arg_limit, MAX2((int)RegMask::SlotsPerLong, vector_alignment)));
 
   // Compute highest outgoing stack argument as
   // _new_SP + out_preserve_stack_slots + max(outgoing argument size).
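A worked example of the new rounding, using assumed values rather than the real Matcher accessors: 16-byte vectors, a 16-byte stack alignment, and 4-byte slots give an alignment of 4 slots, whereas the old code always rounded to SlotsPerLong (2):

#include <algorithm>
#include <cassert>

// Round x up to a multiple of a (a is a power of two here).
static int align_up(int x, int a) { return (x + a - 1) & -a; }

int main() {
  const int slots_per_long  = 2;   // RegMask::SlotsPerLong
  const int stack_slot_size = 4;   // assumed: VMRegImpl::stack_slot_size
  const int max_vector_size = 16;  // assumed: C->max_vector_size() in bytes (VecX)
  const int stack_alignment = 16;  // assumed: stack_alignment_in_bytes()

  int vector_alignment = std::min(max_vector_size, stack_alignment) / stack_slot_size;  // 4 slots
  int in_arg_limit = 9;            // assumed incoming-argument limit, in slots
  int new_sp = align_up(in_arg_limit, std::max(slots_per_long, vector_alignment));
  assert(new_sp == 12);            // the old rounding to SlotsPerLong would give 10,
                                   // leaving 16-byte spill slots only 8-byte aligned
  return 0;
}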
16 changes: 6 additions & 10 deletions src/hotspot/share/opto/regmask.hpp
@@ -354,16 +354,12 @@
   }
 
   // SlotsPerLong is 2, since slots are 32 bits and longs are 64 bits.
-  // Also, consider the maximum alignment size for a normally allocated
-  // value. Since we allocate register pairs but not register quads (at
-  // present), this alignment is SlotsPerLong (== 2). A normally
-  // aligned allocated register is either a single register, or a pair
-  // of adjacent registers, the lower-numbered being even.
-  // See also is_aligned_Pairs() below, and the padding added before
-  // Matcher::_new_SP to keep allocated pairs aligned properly.
-  // If we ever go to quad-word allocations, SlotsPerQuad will become
-  // the controlling alignment constraint. Note that this alignment
-  // requirement is internal to the allocator, and independent of any
+  // We allocate single registers for 32 bit values and register pairs for 64
+  // bit values. The number of registers allocated for vectors matches their size. E.g. for 128 bit
+  // vectors (VecX) we allocate a set of 4 registers. Allocated sets are adjacent and aligned.
+  // See RegMask::find_first_set(), is_aligned_pairs(), is_aligned_sets(), and the padding added before
+  // Matcher::_new_SP to keep allocated pairs and sets aligned properly.
+  // Note that this alignment requirement is internal to the allocator, and independent of any
   // particular platform.
   enum { SlotsPerLong = 2,
          SlotsPerVecA = 4,
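A small sketch of what "adjacent and aligned" means for an allocated set, in the spirit of the is_aligned_pairs()/is_aligned_sets() checks referenced above (the helper below is an illustration, not the RegMask API):

#include <cassert>

// Assumed model: a set of `size` adjacent slots starting at slot index `first`
// is aligned when `first` is a multiple of the set size (a power of two).
static bool is_aligned_set(int first, int size) { return first % size == 0; }

int main() {
  assert(is_aligned_set(6, 2));    // SlotsPerLong == 2: pairs start on even slots
  assert(is_aligned_set(8, 4));    // a VecX set of 4 slots starts on a multiple of 4
  assert(!is_aligned_set(6, 4));   // ...so slot 6 cannot start a VecX set
  return 0;
}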
6 changes: 6 additions & 0 deletions test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java
@@ -1257,6 +1257,12 @@ public class IRNode {
         machOnly(MEM_TO_REG_SPILL_COPY, "MemToRegSpillCopy");
     }
 
+    public static final String MEM_TO_REG_SPILL_COPY_TYPE = COMPOSITE_PREFIX + "MEM_TO_REG_SPILL_COPY_TYPE" + POSTFIX;
+    static {
+        String regex = START + "MemToRegSpillCopy" + MID + IS_REPLACED + ".*" + END;
+        machOnly(MEM_TO_REG_SPILL_COPY_TYPE, regex);
+    }
+
     public static final String MIN = PREFIX + "MIN" + POSTFIX;
     static {
         beforeMatchingNameRegex(MIN, "Min(I|L)");
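For reference, the composite placeholder takes the spill type as a user-supplied string at the use site; the new test below applies it like this (excerpt; the substitution of IS_REPLACED with "vectorx" is handled by the IR framework):

    @Test
    @IR(counts = {IRNode.MEM_TO_REG_SPILL_COPY_TYPE, "vectorx", "> 0"},
        phase = {CompilePhase.FINAL_CODE})
    static long test16ByteSpilling(long l1, long l2, long l3, long l4, long l5, long l6, long l7, long l8,
                                   long l9 /* odd stack arg */) { /* ... */ }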
109 changes: 109 additions & 0 deletions test/hotspot/jtreg/compiler/vectorapi/TestVectorSpilling.java
@@ -0,0 +1,109 @@
/*
* Copyright (c) 2025 SAP SE. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/

package compiler.vectorapi;

import compiler.lib.ir_framework.*;

import jdk.incubator.vector.IntVector;
import jdk.incubator.vector.VectorSpecies;

import jdk.test.lib.Asserts;

/**
* @test
* @bug 8370473
* @library /test/lib /
* @summary Test alignment of vector spill slots. It should match the vector size.
* @modules jdk.incubator.vector
* @requires vm.opt.final.MaxVectorSize == null | vm.opt.final.MaxVectorSize >= 16
*
* @run driver compiler.vectorapi.TestVectorSpilling
*/

public class TestVectorSpilling {

private static final VectorSpecies<Integer> I_SPECIES = IntVector.SPECIES_128;
private static int LENGTH = 1024;

private static int[] ia1;
private static int[] ia2;
private static int[] ir;

public static void main(String[] args) {
TestFramework.runWithFlags("--add-modules=jdk.incubator.vector");
}

static class LData {
// Reading from a volatile field prevents CSE optimization
static volatile long vF = 1042;

long l1, l2, l3, l4, l5, l6, l7, l8;
public LData() {
l1 = vF; l2 = vF; l3 = vF; l4 = vF; l5 = vF; l6 = vF; l7 = vF; l8 = vF;
}
public long sum() {
return l1 + l2 + l3 + l4 + l5 + l6 + l7 + l8;
}
}


@Run(test = "test16ByteSpilling")
static void test16ByteSpilling_runner() {
test16ByteSpilling(1, 2, 3, 4, 5, 6, 7, 8, 9);
}

@Test
@IR(counts = {IRNode.MEM_TO_REG_SPILL_COPY_TYPE, "vectorx", "> 0"},
phase = {CompilePhase.FINAL_CODE})
static long test16ByteSpilling(long l1, long l2, long l3, long l4, long l5, long l6, long l7, long l8,
long l9 /* odd stack arg */) {
// To be scalar replaced and spilled to stack
LData d1 = new LData();
LData d2 = new LData();
LData d3 = new LData();

for (int i = 0; i < LENGTH; i += I_SPECIES.length()) {
IntVector a1v = IntVector.fromArray(I_SPECIES, ia1, i);
IntVector a2v = IntVector.fromArray(I_SPECIES, ia2, i);
int scalar = spillPoint();
a1v.add(a2v)
.add(scalar).intoArray(ir, i);
}

return l1 + l2 + l3 + l4 + l5 + l6 + l7 + l8 + l9 + d1.sum() + d2.sum() + d3.sum();
}

@DontInline
static int spillPoint() {
return 42;
}

static {
ia1 = new int[LENGTH];
ia2 = new int[LENGTH];
ir = new int[LENGTH];
}

}