Skip to content

Commit 98ee279

Browse files
committed
8365991: AArch64: Ignore BlockZeroingLowLimit when UseBlockZeroing is false
Signed-off-by: Patrick Zhang <patrick@os.amperecomputing.com>
1 parent e1c58f8 commit 98ee279

File tree

4 files changed

+118
-64
lines changed

4 files changed

+118
-64
lines changed

src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp

Lines changed: 39 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -6199,23 +6199,22 @@ const int MacroAssembler::zero_words_block_size = 8;
61996199
// ptr, cnt, rscratch1, and rscratch2 are clobbered.
62006200
address MacroAssembler::zero_words(Register ptr, Register cnt)
62016201
{
6202-
assert(is_power_of_2(zero_words_block_size), "adjust this");
6203-
62046202
BLOCK_COMMENT("zero_words {");
6203+
assert(is_power_of_2(zero_words_block_size), "adjust this");
62056204
assert(ptr == r10 && cnt == r11, "mismatch in register usage");
6206-
RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks());
6207-
assert(zero_blocks.target() != nullptr, "zero_blocks stub has not been generated");
62086205

62096206
subs(rscratch1, cnt, zero_words_block_size);
62106207
Label around;
62116208
br(LO, around);
6212-
{
6209+
6210+
if (UseBlockZeroing) {
6211+
// Try to zero blocks using DC ZVA if allowed.
62136212
RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks());
62146213
assert(zero_blocks.target() != nullptr, "zero_blocks stub has not been generated");
6215-
// Make sure this is a C2 compilation. C1 allocates space only for
6216-
// trampoline stubs generated by Call LIR ops, and in any case it
6217-
// makes sense for a C1 compilation task to proceed as quickly as
6218-
// possible.
6214+
// Ensure that this is a C2 compilation.
6215+
// The C1 compiler allocates space solely for trampoline stubs generated
6216+
// by Call LIR operations, and in any case, it is preferable
6217+
// for a C1 compilation task to complete as quickly as possible.
62196218
CompileTask* task;
62206219
if (StubRoutines::aarch64::complete()
62216220
&& Thread::current()->is_Compiler_thread()
@@ -6230,10 +6229,27 @@ address MacroAssembler::zero_words(Register ptr, Register cnt)
62306229
far_call(zero_blocks);
62316230
}
62326231
}
6232+
6233+
// Zero the remaining words one whole block (zero_words_block_size words) at a time.
6234+
// The loop body is unrolled into `unroll` STP instructions, each storing 16 bytes of zeros.
6235+
const int unroll = zero_words_block_size / 2;
6236+
Label done, loop;
6237+
subs(cnt, cnt, unroll * 2);
6238+
br(Assembler::LT, done);
6239+
bind(loop);
6240+
for (int i = 0; i < unroll; i++)
6241+
stp(zr, zr, post(ptr, 16));
6242+
subs(cnt, cnt, unroll * 2);
6243+
br(Assembler::GE, loop);
6244+
bind(done);
6245+
add(cnt, cnt, unroll * 2);
6246+
62336247
bind(around);
62346248

6235-
// We have a few words left to do. zero_blocks has adjusted r10 and r11
6236-
// for us.
6249+
// A few words remain to complete.
6250+
// If called, the zero_blocks routine has already performed the necessary
6251+
// adjustments to registers r10 and r11, ensuring they are correctly set
6252+
// for subsequent processing.
62376253
for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
62386254
Label l;
62396255
tbz(cnt, exact_log2(i), l);
@@ -6259,47 +6275,48 @@ address MacroAssembler::zero_words(Register ptr, Register cnt)
62596275
// r10, r11, rscratch1, and rscratch2 are clobbered.
62606276
address MacroAssembler::zero_words(Register base, uint64_t cnt)
62616277
{
6262-
assert(wordSize <= BlockZeroingLowLimit,
6263-
"increase BlockZeroingLowLimit");
6278+
assert(wordSize <= BlockZeroingLowLimit, "increase BlockZeroingLowLimit");
62646279
address result = nullptr;
6265-
if (cnt <= (uint64_t)BlockZeroingLowLimit / BytesPerWord) {
6280+
if (UseBlockZeroing && cnt > (uint64_t)BlockZeroingLowLimit / BytesPerWord) {
6281+
mov(r10, base); mov(r11, cnt);
6282+
result = zero_words(r10, r11);
6283+
} else {
62666284
#ifndef PRODUCT
62676285
{
62686286
char buf[64];
62696287
snprintf(buf, sizeof buf, "zero_words (count = %" PRIu64 ") {", cnt);
62706288
BLOCK_COMMENT(buf);
62716289
}
62726290
#endif
6273-
if (cnt >= 16) {
6274-
uint64_t loops = cnt/16;
6291+
const int block_size = 16;
6292+
if (cnt >= block_size) {
6293+
uint64_t loops = cnt/block_size;
62756294
if (loops > 1) {
62766295
mov(rscratch2, loops - 1);
62776296
}
62786297
{
62796298
Label loop;
62806299
bind(loop);
6281-
for (int i = 0; i < 16; i += 2) {
6300+
for (int i = 0; i < block_size; i += 2) {
62826301
stp(zr, zr, Address(base, i * BytesPerWord));
62836302
}
6284-
add(base, base, 16 * BytesPerWord);
6303+
add(base, base, block_size * BytesPerWord);
62856304
if (loops > 1) {
62866305
subs(rscratch2, rscratch2, 1);
62876306
br(GE, loop);
62886307
}
62896308
}
62906309
}
6291-
cnt %= 16;
6310+
cnt %= block_size;
62926311
int i = cnt & 1; // store any odd word to start
62936312
if (i) str(zr, Address(base));
62946313
for (; i < (int)cnt; i += 2) {
62956314
stp(zr, zr, Address(base, i * wordSize));
62966315
}
62976316
BLOCK_COMMENT("} zero_words");
62986317
result = pc();
6299-
} else {
6300-
mov(r10, base); mov(r11, cnt);
6301-
result = zero_words(r10, r11);
63026318
}
6319+
63036320
return result;
63046321
}
63056322

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp

Lines changed: 21 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -642,9 +642,9 @@ class StubGenerator: public StubCodeGenerator {
642642
return start;
643643
}
644644

645-
// The inner part of zero_words(). This is the bulk operation,
646-
// zeroing words in blocks, possibly using DC ZVA to do it. The
647-
// caller is responsible for zeroing the last few words.
645+
// The inner part of zero_words(). This is the bulk operation,
646+
// zeroing words in blocks, using DC ZVA.
647+
// The caller is responsible for zeroing the last few words.
648648
//
649649
// Inputs:
650650
// r10: the HeapWord-aligned base address of an array to zero.
@@ -653,10 +653,8 @@ class StubGenerator: public StubCodeGenerator {
653653
// Returns r10 and r11, adjusted for the caller to clear.
654654
// r10: the base address of the tail of words left to clear.
655655
// r11: the number of words in the tail.
656-
// r11 < MacroAssembler::zero_words_block_size.
657-
656+
// r11 < MAX2(zva_length * 2, (int)BlockZeroingLowLimit) >> 3 (the byte limit converted to words)
658657
address generate_zero_blocks() {
659-
Label done;
660658
Label base_aligned;
661659

662660
Register base = r10, cnt = r11;
@@ -666,44 +664,27 @@ class StubGenerator: public StubCodeGenerator {
666664
StubCodeMark mark(this, stub_id);
667665
address start = __ pc();
668666

669-
if (UseBlockZeroing) {
670-
int zva_length = VM_Version::zva_length();
667+
assert (UseBlockZeroing, "only work when UseBlockZeroing is true");
671668

672-
// Ensure ZVA length can be divided by 16. This is required by
673-
// the subsequent operations.
674-
assert (zva_length % 16 == 0, "Unexpected ZVA Length");
669+
int zva_length = VM_Version::zva_length();
675670

676-
__ tbz(base, 3, base_aligned);
677-
__ str(zr, Address(__ post(base, 8)));
678-
__ sub(cnt, cnt, 1);
679-
__ bind(base_aligned);
671+
// Ensure ZVA length can be divided by 16. This is required by
672+
// the subsequent operations.
673+
assert (zva_length % 16 == 0, "Unexpected ZVA Length");
680674

681-
// Ensure count >= zva_length * 2 so that it still deserves a zva after
682-
// alignment.
683-
Label small;
684-
int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
685-
__ subs(rscratch1, cnt, low_limit >> 3);
686-
__ br(Assembler::LT, small);
687-
__ zero_dcache_blocks(base, cnt);
688-
__ bind(small);
689-
}
675+
__ tbz(base, 3, base_aligned);
676+
__ str(zr, Address(__ post(base, 8)));
677+
__ sub(cnt, cnt, 1);
678+
__ bind(base_aligned);
690679

691-
{
692-
// Number of stp instructions we'll unroll
693-
const int unroll =
694-
MacroAssembler::zero_words_block_size / 2;
695-
// Clear the remaining blocks.
696-
Label loop;
697-
__ subs(cnt, cnt, unroll * 2);
698-
__ br(Assembler::LT, done);
699-
__ bind(loop);
700-
for (int i = 0; i < unroll; i++)
701-
__ stp(zr, zr, __ post(base, 16));
702-
__ subs(cnt, cnt, unroll * 2);
703-
__ br(Assembler::GE, loop);
704-
__ bind(done);
705-
__ add(cnt, cnt, unroll * 2);
706-
}
680+
// Ensure count >= zva_length * 2 so that it still deserves a zva after
681+
// alignment.
682+
Label small;
683+
int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
684+
__ subs(rscratch1, cnt, low_limit >> 3);
685+
__ br(Assembler::LT, small);
686+
__ zero_dcache_blocks(base, cnt);
687+
__ bind(small);
707688

708689
__ ret(lr);
709690

src/hotspot/cpu/aarch64/vm_version_aarch64.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -453,6 +453,9 @@ void VM_Version::initialize() {
453453
warning("DC ZVA is not available on this CPU");
454454
FLAG_SET_DEFAULT(UseBlockZeroing, false);
455455
}
456+
if (!UseBlockZeroing && !FLAG_IS_DEFAULT(BlockZeroingLowLimit)) {
457+
warning("BlockZeroingLowLimit will not work when UseBlockZeroing is false");
458+
}
456459

457460
if (VM_Version::supports_sve2()) {
458461
if (FLAG_IS_DEFAULT(UseSVE)) {

test/micro/org/openjdk/bench/vm/gc/RawAllocationRate.java

Lines changed: 55 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
package org.openjdk.bench.vm.gc;
2424

2525
import java.util.concurrent.TimeUnit;
26+
27+
import org.openjdk.bench.vm.gc.RawAllocationRate.BaseClass;
2628
import org.openjdk.jmh.annotations.*;
2729
import org.openjdk.jmh.infra.Blackhole;
2830

@@ -37,7 +39,7 @@
3739
@State(Scope.Thread)
3840
public class RawAllocationRate {
3941

40-
@Param({"32", "64", "256", "1024", "2048", "4096", "8192", "16384", "65536", "131072"}) // Object size in bytes.
42+
@Param({"32", "48", "64", "80", "96", "128", "256", "1024", "2048", "4096", "8192", "16384", "65536", "131072"}) // Object size in bytes.
4143
public int size;
4244

4345
Object[] objects;
@@ -113,11 +115,16 @@ public Object[] instanceTest_C1() {
113115

114116
static class BaseClass {
115117
}
116-
117118
static class Class32 extends BaseClass {
118119
long i0;
119120
long i1;
120121
}
122+
static class Class48 extends BaseClass {
123+
long i0;
124+
long i1;
125+
long i2;
126+
long i3;
127+
}
121128
static class Class64 extends BaseClass {
122129
long i0;
123130
long i1;
@@ -126,6 +133,44 @@ static class Class64 extends BaseClass {
126133
long i4;
127134
long i5;
128135
}
136+
static class Class80 extends BaseClass {
137+
long i0;
138+
long i1;
139+
long i2;
140+
long i3;
141+
long i4;
142+
long i5;
143+
long i6;
144+
long i7;
145+
}
146+
static class Class96 extends BaseClass {
147+
long i0;
148+
long i1;
149+
long i2;
150+
long i3;
151+
long i4;
152+
long i5;
153+
long i6;
154+
long i7;
155+
long i8;
156+
long i9;
157+
}
158+
static class Class128 extends BaseClass {
159+
long i0;
160+
long i1;
161+
long i2;
162+
long i3;
163+
long i4;
164+
long i5;
165+
long i6;
166+
long i7;
167+
long i8;
168+
long i9;
169+
long i10;
170+
long i11;
171+
long i12;
172+
long i13;
173+
}
129174
static class Class256 extends BaseClass {
130175
long i0;
131176
long i1;
@@ -28707,8 +28752,16 @@ static final BaseClass newInstance(int size) {
2870728752
switch (size) {
2870828753
case 32:
2870928754
return new Class32();
28755+
case 48:
28756+
return new Class48();
2871028757
case 64:
2871128758
return new Class64();
28759+
case 80:
28760+
return new Class80();
28761+
case 96:
28762+
return new Class96();
28763+
case 128:
28764+
return new Class128();
2871228765
case 256:
2871328766
return new Class256();
2871428767
case 1024:

0 commit comments

Comments
 (0)