diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad
index 5734519301e28..5cf71ecf04090 100644
--- a/src/hotspot/cpu/aarch64/aarch64.ad
+++ b/src/hotspot/cpu/aarch64/aarch64.ad
@@ -2007,6 +2007,9 @@ uint MachSpillCopyNode::implementation(C2_MacroAssembler *masm, PhaseRegAlloc *r
 
   if (bottom_type()->isa_vect() && !bottom_type()->isa_vectmask()) {
     uint ireg = ideal_reg();
+    DEBUG_ONLY(int algm = MIN2(RegMask::num_registers(ireg), (int)Matcher::stack_alignment_in_slots()) * VMRegImpl::stack_slot_size);
+    assert((src_lo_rc != rc_stack) || is_aligned(src_offset, algm), "unaligned vector spill sp offset %d (src)", src_offset);
+    assert((dst_lo_rc != rc_stack) || is_aligned(dst_offset, algm), "unaligned vector spill sp offset %d (dst)", dst_offset);
     if (ireg == Op_VecA && masm) {
       int sve_vector_reg_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
       if (src_lo_rc == rc_stack && dst_lo_rc == rc_stack) {
diff --git a/src/hotspot/cpu/ppc/ppc.ad b/src/hotspot/cpu/ppc/ppc.ad
index 03dbd0e780ba2..07a73d2eb3665 100644
--- a/src/hotspot/cpu/ppc/ppc.ad
+++ b/src/hotspot/cpu/ppc/ppc.ad
@@ -1795,10 +1795,13 @@ uint MachSpillCopyNode::implementation(C2_MacroAssembler *masm, PhaseRegAlloc *r
     return size;   // Self copy, no move.
 
   if (bottom_type()->isa_vect() != nullptr && ideal_reg() == Op_VecX) {
+    int src_offset = ra_->reg2offset(src_lo);
+    int dst_offset = ra_->reg2offset(dst_lo);
+    DEBUG_ONLY(int algm = MIN2(RegMask::num_registers(ideal_reg()), (int)Matcher::stack_alignment_in_slots()) * VMRegImpl::stack_slot_size);
+    assert((src_lo_rc != rc_stack) || is_aligned(src_offset, algm), "unaligned vector spill sp offset %d (src)", src_offset);
+    assert((dst_lo_rc != rc_stack) || is_aligned(dst_offset, algm), "unaligned vector spill sp offset %d (dst)", dst_offset);
     // Memory->Memory Spill.
     if (src_lo_rc == rc_stack && dst_lo_rc == rc_stack) {
-      int src_offset = ra_->reg2offset(src_lo);
-      int dst_offset = ra_->reg2offset(dst_lo);
       if (masm) {
         __ ld(R0, src_offset, R1_SP);
         __ std(R0, dst_offset, R1_SP);
@@ -1806,26 +1809,20 @@ uint MachSpillCopyNode::implementation(C2_MacroAssembler *masm, PhaseRegAlloc *r
         __ std(R0, dst_offset+8, R1_SP);
       }
       size += 16;
+#ifndef PRODUCT
+      if (st != nullptr) {
+        st->print("%-7s [R1_SP + #%d] -> [R1_SP + #%d] \t// vector spill copy", "SPILL", src_offset, dst_offset);
+      }
+#endif // !PRODUCT
     }
     // VectorRegister->Memory Spill.
     else if (src_lo_rc == rc_vec && dst_lo_rc == rc_stack) {
       VectorSRegister Rsrc = as_VectorRegister(Matcher::_regEncode[src_lo]).to_vsr();
-      int dst_offset = ra_->reg2offset(dst_lo);
       if (PowerArchitecturePPC64 >= 9) {
-        if (is_aligned(dst_offset, 16)) {
-          if (masm) {
-            __ stxv(Rsrc, dst_offset, R1_SP); // matches storeV16_Power9
-          }
-          size += 4;
-        } else {
-          // Other alignment can be used by Vector API (VectorPayload in rearrangeOp,
-          // observed with VectorRearrangeTest.java on Power9).
-          if (masm) {
-            __ addi(R0, R1_SP, dst_offset);
-            __ stxvx(Rsrc, R0); // matches storeV16_Power9 (regarding element ordering)
-          }
-          size += 8;
+        if (masm) {
+          __ stxv(Rsrc, dst_offset, R1_SP); // matches storeV16_Power9
         }
+        size += 4;
       } else {
         if (masm) {
           __ addi(R0, R1_SP, dst_offset);
@@ -1833,24 +1830,25 @@ uint MachSpillCopyNode::implementation(C2_MacroAssembler *masm, PhaseRegAlloc *r
         }
         size += 8;
       }
+#ifndef PRODUCT
+      if (st != nullptr) {
+        if (PowerArchitecturePPC64 >= 9) {
+          st->print("%-7s %s, [R1_SP + #%d] \t// vector spill copy", "STXV", Matcher::regName[src_lo], dst_offset);
+        } else {
+          st->print("%-7s %s, R1_SP, %d \t// vector spill copy", "ADDI", Matcher::regName[src_lo], dst_offset);
+          st->print("%-7s %s, [R1_SP] \t// vector spill copy", "STXVD2X", Matcher::regName[src_lo]);
+        }
+      }
+#endif // !PRODUCT
     }
     // Memory->VectorRegister Spill.
     else if (src_lo_rc == rc_stack && dst_lo_rc == rc_vec) {
       VectorSRegister Rdst = as_VectorRegister(Matcher::_regEncode[dst_lo]).to_vsr();
-      int src_offset = ra_->reg2offset(src_lo);
       if (PowerArchitecturePPC64 >= 9) {
-        if (is_aligned(src_offset, 16)) {
-          if (masm) {
-            __ lxv(Rdst, src_offset, R1_SP);
-          }
-          size += 4;
-        } else {
-          if (masm) {
-            __ addi(R0, R1_SP, src_offset);
-            __ lxvx(Rdst, R0);
-          }
-          size += 8;
+        if (masm) {
+          __ lxv(Rdst, src_offset, R1_SP);
         }
+        size += 4;
       } else {
         if (masm) {
           __ addi(R0, R1_SP, src_offset);
@@ -1858,6 +1856,16 @@ uint MachSpillCopyNode::implementation(C2_MacroAssembler *masm, PhaseRegAlloc *r
         }
         size += 8;
       }
+#ifndef PRODUCT
+      if (st != nullptr) {
+        if (PowerArchitecturePPC64 >= 9) {
+          st->print("%-7s %s, [R1_SP + #%d] \t// vector spill copy", "LXV", Matcher::regName[dst_lo], src_offset);
+        } else {
+          st->print("%-7s %s, R1_SP, %d \t// vector spill copy", "ADDI", Matcher::regName[src_lo], src_offset);
+          st->print("%-7s %s, [R1_SP] \t// vector spill copy", "LXVD2X", Matcher::regName[dst_lo]);
+        }
+      }
+#endif // !PRODUCT
     }
     // VectorRegister->VectorRegister.
     else if (src_lo_rc == rc_vec && dst_lo_rc == rc_vec) {
@@ -1867,6 +1875,12 @@ uint MachSpillCopyNode::implementation(C2_MacroAssembler *masm, PhaseRegAlloc *r
         __ xxlor(Rdst, Rsrc, Rsrc);
       }
       size += 4;
+#ifndef PRODUCT
+      if (st != nullptr) {
+        st->print("%-7s %s, %s, %s\t// vector spill copy",
+                  "XXLOR", Matcher::regName[dst_lo], Matcher::regName[src_lo], Matcher::regName[src_lo]);
+      }
+#endif // !PRODUCT
     }
     else {
       ShouldNotReachHere(); // No VR spill.
diff --git a/src/hotspot/share/opto/chaitin.hpp b/src/hotspot/share/opto/chaitin.hpp
index b477c54fcae49..e2f4196941f0c 100644
--- a/src/hotspot/share/opto/chaitin.hpp
+++ b/src/hotspot/share/opto/chaitin.hpp
@@ -143,7 +143,7 @@ class LRG : public ResourceObj {
 
 private:
   // Number of registers this live range uses when it colors
-  uint16_t _num_regs;           // 2 for Longs and Doubles, 1 for all else
+  uint16_t _num_regs;           // byte size of the value divided by 4
                                 // except _num_regs is kill count for fat_proj
 
   // For scalable register, num_regs may not be the actual physical register size.
diff --git a/src/hotspot/share/opto/matcher.cpp b/src/hotspot/share/opto/matcher.cpp
index c63cefe7ac201..729b96775a32d 100644
--- a/src/hotspot/share/opto/matcher.cpp
+++ b/src/hotspot/share/opto/matcher.cpp
@@ -283,13 +283,12 @@ void Matcher::match( ) {
     _parm_regs[i].set_pair(reg2, reg1);
   }
 
-  // Finally, make sure the incoming arguments take up an even number of
-  // words, in case the arguments or locals need to contain doubleword stack
-  // slots.  The rest of the system assumes that stack slot pairs (in
-  // particular, in the spill area) which look aligned will in fact be
-  // aligned relative to the stack pointer in the target machine.  Double
-  // stack slots will always be allocated aligned.
-  _new_SP = OptoReg::Name(align_up(_in_arg_limit, (int)RegMask::SlotsPerLong));
+  // Allocated register sets are aligned to their size. Offsets relative to the
+  // stack pointer have to be aligned to the size of the access. Therefore, _new_SP
+  // is aligned to the size of the largest register set, capped at the stack
+  // alignment, with a minimum of SlotsPerLong (2).
+  int vector_alignment = MIN2(C->max_vector_size(), stack_alignment_in_bytes()) / VMRegImpl::stack_slot_size;
+  _new_SP = OptoReg::Name(align_up(_in_arg_limit, MAX2((int)RegMask::SlotsPerLong, vector_alignment)));
 
   // Compute highest outgoing stack argument as
   //   _new_SP + out_preserve_stack_slots + max(outgoing argument size).
diff --git a/src/hotspot/share/opto/regmask.hpp b/src/hotspot/share/opto/regmask.hpp
index 453fbb45d33b7..421031fdf6154 100644
--- a/src/hotspot/share/opto/regmask.hpp
+++ b/src/hotspot/share/opto/regmask.hpp
@@ -354,16 +354,12 @@ class RegMask {
   }
 
   // SlotsPerLong is 2, since slots are 32 bits and longs are 64 bits.
-  // Also, consider the maximum alignment size for a normally allocated
-  // value.  Since we allocate register pairs but not register quads (at
-  // present), this alignment is SlotsPerLong (== 2).  A normally
-  // aligned allocated register is either a single register, or a pair
-  // of adjacent registers, the lower-numbered being even.
-  // See also is_aligned_Pairs() below, and the padding added before
-  // Matcher::_new_SP to keep allocated pairs aligned properly.
-  // If we ever go to quad-word allocations, SlotsPerQuad will become
-  // the controlling alignment constraint.  Note that this alignment
-  // requirement is internal to the allocator, and independent of any
+  // We allocate single registers for 32-bit values and register pairs for 64-bit
+  // values. The number of registers allocated for a vector matches its size, e.g. for 128-bit
+  // vectors (VecX) we allocate a set of 4 registers. Allocated sets are adjacent and aligned.
+  // See RegMask::find_first_set(), is_aligned_pairs(), is_aligned_sets(), and the padding added before
+  // Matcher::_new_SP to keep allocated pairs and sets aligned properly.
+  // Note that this alignment requirement is internal to the allocator, and independent of any
   // particular platform.
   enum {
     SlotsPerLong = 2,
     SlotsPerVecA = 4,
diff --git a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java
index f0f7aaf383615..665e3e52c0f31 100644
--- a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java
+++ b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java
@@ -1257,6 +1257,12 @@ public class IRNode {
         machOnly(MEM_TO_REG_SPILL_COPY, "MemToRegSpillCopy");
     }
 
+    public static final String MEM_TO_REG_SPILL_COPY_TYPE = COMPOSITE_PREFIX + "MEM_TO_REG_SPILL_COPY_TYPE" + POSTFIX;
+    static {
+        String regex = START + "MemToRegSpillCopy" + MID + IS_REPLACED + ".*" + END;
+        machOnly(MEM_TO_REG_SPILL_COPY_TYPE, regex);
+    }
+
     public static final String MIN = PREFIX + "MIN" + POSTFIX;
     static {
         beforeMatchingNameRegex(MIN, "Min(I|L)");
diff --git a/test/hotspot/jtreg/compiler/vectorapi/TestVectorSpilling.java b/test/hotspot/jtreg/compiler/vectorapi/TestVectorSpilling.java
new file mode 100644
index 0000000000000..5e8b9341d8e3a
--- /dev/null
+++ b/test/hotspot/jtreg/compiler/vectorapi/TestVectorSpilling.java
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2025 SAP SE. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+package compiler.vectorapi;
+
+import compiler.lib.ir_framework.*;
+
+import jdk.incubator.vector.IntVector;
+import jdk.incubator.vector.VectorSpecies;
+
+import jdk.test.lib.Asserts;
+
+/**
+ * @test
+ * @bug 8370473
+ * @library /test/lib /
+ * @summary Test alignment of vector spill slots. It should match the vector size.
+ * @modules jdk.incubator.vector
+ * @requires vm.opt.final.MaxVectorSize == null | vm.opt.final.MaxVectorSize >= 16
+ *
+ * @run driver compiler.vectorapi.TestVectorSpilling
+ */
+
+public class TestVectorSpilling {
+
+    private static final VectorSpecies<Integer> I_SPECIES = IntVector.SPECIES_128;
+    private static int LENGTH = 1024;
+
+    private static int[] ia1;
+    private static int[] ia2;
+    private static int[] ir;
+
+    public static void main(String[] args) {
+        TestFramework.runWithFlags("--add-modules=jdk.incubator.vector");
+    }
+
+    static class LData {
+        // Reading from a volatile field prevents CSE optimization
+        static volatile long vF = 1042;
+
+        long l1, l2, l3, l4, l5, l6, l7, l8;
+        public LData() {
+            l1 = vF; l2 = vF; l3 = vF; l4 = vF; l5 = vF; l6 = vF; l7 = vF; l8 = vF;
+        }
+        public long sum() {
+            return l1 + l2 + l3 + l4 + l5 + l6 + l7 + l8;
+        }
+    }
+
+    @Run(test = "test16ByteSpilling")
+    static void test16ByteSpilling_runner() {
+        test16ByteSpilling(1, 2, 3, 4, 5, 6, 7, 8, 9);
+    }
+
+    @Test
+    @IR(counts = {IRNode.MEM_TO_REG_SPILL_COPY_TYPE, "vectorx", "> 0"},
+        phase = {CompilePhase.FINAL_CODE})
+    static long test16ByteSpilling(long l1, long l2, long l3, long l4, long l5, long l6, long l7, long l8,
+                                   long l9 /* odd stack arg */) {
+        // To be scalar replaced and spilled to stack
+        LData d1 = new LData();
+        LData d2 = new LData();
+        LData d3 = new LData();
+
+        for (int i = 0; i < LENGTH; i += I_SPECIES.length()) {
+            IntVector a1v = IntVector.fromArray(I_SPECIES, ia1, i);
+            IntVector a2v = IntVector.fromArray(I_SPECIES, ia2, i);
+            int scalar = spillPoint();
+            a1v.add(a2v)
+               .add(scalar).intoArray(ir, i);
+        }
+
+        return l1 + l2 + l3 + l4 + l5 + l6 + l7 + l8 + l9 + d1.sum() + d2.sum() + d3.sum();
+    }
+
+    @DontInline
+    static int spillPoint() {
+        return 42;
+    }
+
+    static {
+        ia1 = new int[LENGTH];
+        ia2 = new int[LENGTH];
+        ir = new int[LENGTH];
+    }
+}
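
As background for reviewers, the standalone C++ sketch below (not part of the patch) mirrors the two computations introduced above: the spill-slot alignment checked by the new DEBUG_ONLY asserts in aarch64.ad and ppc.ad, and the _new_SP rounding in Matcher::match(). All constant values are illustrative assumptions for a 64-bit platform with 4-byte stack slots, 16-byte stack alignment, and 16-byte (VecX) vectors; in HotSpot they come from VMRegImpl, Matcher, RegMask, and the compile-time vector size.

// Standalone illustration only; not part of the patch. Constants are assumptions
// for a typical 64-bit platform; in HotSpot they come from VMRegImpl, Matcher,
// RegMask and C->max_vector_size().
#include <algorithm>
#include <cstdio>

int main() {
  const int stack_slot_size          = 4;   // VMRegImpl::stack_slot_size
  const int stack_alignment_in_bytes = 16;  // Matcher::stack_alignment_in_bytes()
  const int stack_alignment_in_slots = stack_alignment_in_bytes / stack_slot_size;
  const int num_regs_vecx            = 4;   // RegMask::num_registers(Op_VecX): 128 bits / 32-bit slots
  const int max_vector_size          = 16;  // C->max_vector_size() in bytes

  // Shape of the DEBUG_ONLY computation guarding the new asserts: the required
  // spill-slot alignment is the register-set size, capped by the stack alignment.
  int algm = std::min(num_regs_vecx, stack_alignment_in_slots) * stack_slot_size;
  std::printf("VecX spill offsets must be %d-byte aligned\n", algm);  // prints 16

  // Shape of the Matcher::match() change: _new_SP is rounded up so that spill
  // slots which look aligned really are aligned relative to SP.
  int vector_alignment = std::min(max_vector_size, stack_alignment_in_bytes) / stack_slot_size;
  int alignment        = std::max(2, vector_alignment);   // minimum is SlotsPerLong (2)
  int in_arg_limit     = 7;                                // example value, in 32-bit slots
  int new_sp           = (in_arg_limit + alignment - 1) / alignment * alignment;  // align_up
  std::printf("_in_arg_limit %d slots -> _new_SP %d slots\n", in_arg_limit, new_sp);  // 7 -> 8
  return 0;
}

Note how MIN2 caps the required alignment at the stack alignment: on platforms whose vectors are larger than the stack alignment, only stack-alignment-sized padding can be guaranteed, which is why both the asserts and the _new_SP computation use the same cap.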