Skip to content

Commit 14c18f7

Browse files
committed
Roll back main changes on zero_words_reg_reg and generate_zero_blocks
Signed-off-by: Patrick Zhang <patrick@os.amperecomputing.com>
1 parent 98ee279 commit 14c18f7

File tree

2 files changed

+52
-47
lines changed

2 files changed

+52
-47
lines changed

src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp

Lines changed: 11 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -6199,22 +6199,21 @@ const int MacroAssembler::zero_words_block_size = 8;
61996199
// ptr, cnt, rscratch1, and rscratch2 are clobbered.
62006200
address MacroAssembler::zero_words(Register ptr, Register cnt)
62016201
{
6202-
BLOCK_COMMENT("zero_words {");
62036202
assert(is_power_of_2(zero_words_block_size), "adjust this");
6203+
6204+
BLOCK_COMMENT("zero_words {");
62046205
assert(ptr == r10 && cnt == r11, "mismatch in register usage");
62056206

62066207
subs(rscratch1, cnt, zero_words_block_size);
62076208
Label around;
62086209
br(LO, around);
6209-
6210-
if (UseBlockZeroing) {
6211-
// Try to zero blocks using DC ZVA if allowed.
6210+
{
62126211
RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks());
62136212
assert(zero_blocks.target() != nullptr, "zero_blocks stub has not been generated");
6214-
// Ensure that this is a C2 compilation.
6215-
// The C1 compiler allocates space solely for trampoline stubs generated
6216-
// by Call LIR operations, and in any case, it is preferable
6217-
// for a C1 compilation task to complete as quickly as possible.
6213+
// Make sure this is a C2 compilation. C1 allocates space only for
6214+
// trampoline stubs generated by Call LIR ops, and in any case it
6215+
// makes sense for a C1 compilation task to proceed as quickly as
6216+
// possible.
62186217
CompileTask* task;
62196218
if (StubRoutines::aarch64::complete()
62206219
&& Thread::current()->is_Compiler_thread()
@@ -6229,26 +6228,11 @@ address MacroAssembler::zero_words(Register ptr, Register cnt)
62296228
far_call(zero_blocks);
62306229
}
62316230
}
6232-
6233-
// Process words with length exceeding the predefined block size threshold.
6234-
// The loop body will be unrolled based on the number of STP instructions calculated below.
6235-
const int unroll = zero_words_block_size / 2;
6236-
Label done, loop;
6237-
subs(cnt, cnt, unroll * 2);
6238-
br(Assembler::LT, done);
6239-
bind(loop);
6240-
for (int i = 0; i < unroll; i++)
6241-
stp(zr, zr, post(ptr, 16));
6242-
subs(cnt, cnt, unroll * 2);
6243-
br(Assembler::GE, loop);
6244-
bind(done);
6245-
add(cnt, cnt, unroll * 2);
6246-
62476231
bind(around);
62486232

62496233
// A few words remain to complete.
6250-
// If called, the zero_blocks routine has already performed the necessary
6251-
// adjustments to registers r10 and r11, ensuring they are correctly set
6234+
// The zero_blocks routine has already performed the necessary
6235+
// adjustments to r10 and r11, ensuring they are correctly set
62526236
// for subsequent processing.
62536237
for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
62546238
Label l;
@@ -6288,6 +6272,8 @@ address MacroAssembler::zero_words(Register base, uint64_t cnt)
62886272
BLOCK_COMMENT(buf);
62896273
}
62906274
#endif
6275+
// Use 16 words as the block size which is 128 bytes on 64-bit systems.
6276+
// A complete loop body will be 8 STPs unrolled there.
62916277
const int block_size = 16;
62926278
if (cnt >= block_size) {
62936279
uint64_t loops = cnt/block_size;

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp

Lines changed: 41 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -642,9 +642,9 @@ class StubGenerator: public StubCodeGenerator {
642642
return start;
643643
}
644644

645-
// The inner part of zero_words(). This is the bulk operation,
646-
// zeroing words in blocks, using DC ZVA.
647-
// The caller is responsible for zeroing the last few words.
645+
// The inner part of zero_words(). This is the bulk operation,
646+
// zeroing words in blocks, possibly using DC ZVA to do it. The
647+
// caller is responsible for zeroing the last few words.
648648
//
649649
// Inputs:
650650
// r10: the HeapWord-aligned base address of an array to zero.
@@ -653,8 +653,10 @@ class StubGenerator: public StubCodeGenerator {
653653
// Returns r10 and r11, adjusted for the caller to clear.
654654
// r10: the base address of the tail of words left to clear.
655655
// r11: the number of words in the tail.
656-
// r11 < MAX2(zva_length * 2, (int)BlockZeroingLowLimit)
656+
// r11 < MacroAssembler::zero_words_block_size.
657+
657658
address generate_zero_blocks() {
659+
Label done;
658660
Label base_aligned;
659661

660662
Register base = r10, cnt = r11;
@@ -664,34 +666,51 @@ class StubGenerator: public StubCodeGenerator {
664666
StubCodeMark mark(this, stub_id);
665667
address start = __ pc();
666668

667-
assert (UseBlockZeroing, "only work when UseBlockZeroing is true");
669+
if (UseBlockZeroing) {
670+
int zva_length = VM_Version::zva_length();
668671

669-
int zva_length = VM_Version::zva_length();
672+
// Ensure ZVA length can be divided by 16. This is required by
673+
// the subsequent operations.
674+
assert (zva_length % 16 == 0, "Unexpected ZVA Length");
670675

671-
// Ensure ZVA length can be divided by 16. This is required by
672-
// the subsequent operations.
673-
assert (zva_length % 16 == 0, "Unexpected ZVA Length");
676+
__ tbz(base, 3, base_aligned);
677+
__ str(zr, Address(__ post(base, 8)));
678+
__ sub(cnt, cnt, 1);
679+
__ bind(base_aligned);
674680

675-
__ tbz(base, 3, base_aligned);
676-
__ str(zr, Address(__ post(base, 8)));
677-
__ sub(cnt, cnt, 1);
678-
__ bind(base_aligned);
681+
// Ensure count >= zva_length * 2 so that it still deserves a zva after
682+
// alignment.
683+
Label small;
684+
int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
685+
__ subs(rscratch1, cnt, low_limit >> 3);
686+
__ br(Assembler::LT, small);
687+
__ zero_dcache_blocks(base, cnt);
688+
__ bind(small);
689+
}
679690

680-
// Ensure count >= zva_length * 2 so that it still deserves a zva after
681-
// alignment.
682-
Label small;
683-
int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
684-
__ subs(rscratch1, cnt, low_limit >> 3);
685-
__ br(Assembler::LT, small);
686-
__ zero_dcache_blocks(base, cnt);
687-
__ bind(small);
691+
{
692+
// Process words with length exceeding the predefined
693+
// block size threshold. The loop body will be unrolled based on
694+
// the number of STPs calculated below.
695+
const int unroll = MacroAssembler::zero_words_block_size / 2;
696+
// Clear the remaining blocks.
697+
Label loop;
698+
__ subs(cnt, cnt, unroll * 2);
699+
__ br(Assembler::LT, done);
700+
__ bind(loop);
701+
for (int i = 0; i < unroll; i++)
702+
__ stp(zr, zr, __ post(base, 16));
703+
__ subs(cnt, cnt, unroll * 2);
704+
__ br(Assembler::GE, loop);
705+
__ bind(done);
706+
__ add(cnt, cnt, unroll * 2);
707+
}
688708

689709
__ ret(lr);
690710

691711
return start;
692712
}
693713

694-
695714
typedef enum {
696715
copy_forwards = 1,
697716
copy_backwards = -1

0 commit comments

Comments
 (0)