Skip to content

Commit

Permalink
8255246: AArch64: Implement BigInteger shiftRight and shiftLeft accel…
Browse files Browse the repository at this point in the history
…erator/intrinsic

Reviewed-by: aph
  • Loading branch information
Dong Bo authored and RealFYang committed Oct 28, 2020
1 parent 591e7e2 commit 6b2d11b
Show file tree
Hide file tree
Showing 3 changed files with 275 additions and 2 deletions.
2 changes: 2 additions & 0 deletions src/hotspot/cpu/aarch64/globals_aarch64.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ define_pd_global(intx, InlineSmallCode, 1000);
"Use SIMD instructions in generated array equals code") \
product(bool, UseSimpleArrayEquals, false, \
"Use simpliest and shortest implementation for array equals") \
product(bool, UseSIMDForBigIntegerShiftIntrinsics, true, \
"Use SIMD instructions for left/right shift of BigInteger") \
product(bool, AvoidUnalignedAccesses, false, \
"Avoid generating unaligned memory accesses") \
product(bool, UseLSE, false, \
Expand Down
237 changes: 237 additions & 0 deletions src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3968,6 +3968,238 @@ class StubGenerator: public StubCodeGenerator {
return start;
}

// Generates the "bigIntegerRightShiftWorker" stub and returns its entry address.
//
// Treating both arrays as arrays of 32-bit words, the generated code computes,
// for every i in [0, numIter):
//
//   newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount) | (oldArr[i] << (32 - shiftCount))
//
// where >>> is a logical (zero-filling) right shift. Words are produced from
// the highest index downwards, and oldArr[0 .. numIter] (numIter + 1 words)
// is read. Nothing is done when numIter is zero.
//
// Arguments:
//
// Input:
// c_rarg0 - newArr address
// c_rarg1 - oldArr address
// c_rarg2 - newIdx
// c_rarg3 - shiftCount
// c_rarg4 - numIter
//
// Registers written by the stub: c_rarg0, c_rarg4, rscratch1, rscratch2,
// r10-r14 and v0-v4 (which of them depends on the path taken).
//
// NOTE(review): presumably shiftCount is always in [1, 31]; the caller is not
// visible here - confirm before relying on other values.
//
address generate_bigIntegerRightShift() {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker");
address start = __ pc();

Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;

// Incoming arguments.
Register newArr = c_rarg0;
Register oldArr = c_rarg1;
Register newIdx = c_rarg2;
Register shiftCount = c_rarg3;
Register numIter = c_rarg4;
// idx aliases numIter: it holds the number of words still to be produced and
// is counted down as the loops below work from the top of the range to 0.
Register idx = numIter;

// Temporaries.
Register newArrCur = rscratch1;
Register shiftRevCount = rscratch2;
Register oldArrCur = r13;
Register oldArrNext = r14;

FloatRegister oldElem0 = v0;
FloatRegister oldElem1 = v1;
FloatRegister newElem = v2;
FloatRegister shiftVCount = v3;
FloatRegister shiftVRevCount = v4;

// Nothing to do for zero iterations.
__ cbz(idx, Exit);

// Rebase newArr so that it points at word newIdx of the destination.
__ add(newArr, newArr, newIdx, Assembler::LSL, 2);

// left shift count: shiftRevCount = 32 - shiftCount
__ movw(shiftRevCount, 32);
__ subw(shiftRevCount, shiftRevCount, shiftCount);

// numIter too small to allow a 4-words SIMD loop, fall back to the scalar
// code at ShiftThree
__ cmp(numIter, (u1)4);
__ br(Assembler::LT, ShiftThree);

// Broadcast both shift counts to all four 32-bit lanes. shiftVCount is then
// negated so that ushl with it performs a right shift.
__ dup(shiftVCount, __ T4S, shiftCount);
__ dup(shiftVRevCount, __ T4S, shiftRevCount);
__ negr(shiftVCount, __ T4S, shiftVCount);

// Main loop: produces the 4 result words [idx - 4, idx) per iteration.
__ BIND(ShiftSIMDLoop);

// Calculate the load addresses:
//   oldArrNext = &oldArr[idx], oldArrCur = &oldArr[idx + 1], newArrCur = &newArr[idx]
// (idx has already been decremented by 4).
__ sub(idx, idx, 4);
__ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
__ add(newArrCur, newArr, idx, Assembler::LSL, 2);
__ add(oldArrCur, oldArrNext, 4);

// Load 4 words and process: (oldArr[i + 1] >>> shiftCount) | (oldArr[i] << shiftRevCount)
__ ld1(oldElem0, __ T4S, Address(oldArrCur));
__ ld1(oldElem1, __ T4S, Address(oldArrNext));
__ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
__ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
__ orr(newElem, __ T16B, oldElem0, oldElem1);
__ st1(newElem, __ T4S, Address(newArrCur));

// Keep going while at least 4 words remain; otherwise handle the remainder.
__ cmp(idx, (u1)4);
__ br(Assembler::LT, ShiftTwoLoop);
__ b(ShiftSIMDLoop);

// Remainder after the SIMD loop (fewer than 4 words left): two words at a
// time, with a single leftover word handled by ShiftOne.
__ BIND(ShiftTwoLoop);
__ cbz(idx, Exit);
__ cmp(idx, (u1)1);
__ br(Assembler::EQ, ShiftOne);

// Calculate the load addresses (same scheme as above, stepping by 2 words)
__ sub(idx, idx, 2);
__ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
__ add(newArrCur, newArr, idx, Assembler::LSL, 2);
__ add(oldArrCur, oldArrNext, 4);

// Load 2 words and process; only the low two lanes of the shift-count
// vectors are used here.
__ ld1(oldElem0, __ T2S, Address(oldArrCur));
__ ld1(oldElem1, __ T2S, Address(oldArrNext));
__ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
__ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
__ orr(newElem, __ T8B, oldElem0, oldElem1);
__ st1(newElem, __ T2S, Address(newArrCur));
__ b(ShiftTwoLoop);

// Scalar path taken when numIter is below 4 on entry (idx is nonzero here).
// Bit 1 of idx clear -> only one word to do; otherwise bit 0 clear -> two
// words; otherwise three. The three sections fall through into each other,
// producing newArr[2], newArr[1] and newArr[0] in that order.
__ BIND(ShiftThree);
__ tbz(idx, 1, ShiftOne);
__ tbz(idx, 0, ShiftTwo);
// newArr[2] = (oldArr[3] >>> shiftCount) | (oldArr[2] << shiftRevCount)
__ ldrw(r10, Address(oldArr, 12));
__ ldrw(r11, Address(oldArr, 8));
__ lsrvw(r10, r10, shiftCount);
__ lslvw(r11, r11, shiftRevCount);
__ orrw(r12, r10, r11);
__ strw(r12, Address(newArr, 8));

// newArr[1] = (oldArr[2] >>> shiftCount) | (oldArr[1] << shiftRevCount)
__ BIND(ShiftTwo);
__ ldrw(r10, Address(oldArr, 8));
__ ldrw(r11, Address(oldArr, 4));
__ lsrvw(r10, r10, shiftCount);
__ lslvw(r11, r11, shiftRevCount);
__ orrw(r12, r10, r11);
__ strw(r12, Address(newArr, 4));

// newArr[0] = (oldArr[1] >>> shiftCount) | (oldArr[0] << shiftRevCount).
// Also the target for the single leftover word of ShiftTwoLoop.
__ BIND(ShiftOne);
__ ldrw(r10, Address(oldArr, 4));
__ ldrw(r11, Address(oldArr));
__ lsrvw(r10, r10, shiftCount);
__ lslvw(r11, r11, shiftRevCount);
__ orrw(r12, r10, r11);
__ strw(r12, Address(newArr));

__ BIND(Exit);
__ ret(lr);

return start;
}

// Generates the "bigIntegerLeftShiftWorker" stub and returns its entry address.
//
// Treating both arrays as arrays of 32-bit words, the generated code computes,
// for every i in [0, numIter):
//
//   newArr[newIdx + i] = (oldArr[i] << shiftCount) | (oldArr[i + 1] >>> (32 - shiftCount))
//
// where >>> is a logical (zero-filling) right shift. Words are produced in
// ascending index order using post-incremented pointers, and
// oldArr[0 .. numIter] (numIter + 1 words) is read. Nothing is done when
// numIter is zero.
//
// Arguments:
//
// Input:
// c_rarg0 - newArr address
// c_rarg1 - oldArr address
// c_rarg2 - newIdx
// c_rarg3 - shiftCount
// c_rarg4 - numIter
//
// Registers written by the stub: c_rarg0, c_rarg1, c_rarg4, rscratch1,
// rscratch2, r10-r12 and v0-v4 (which of them depends on the path taken).
//
// NOTE(review): presumably shiftCount is always in [1, 31]; the caller is not
// visible here - confirm before relying on other values.
//
address generate_bigIntegerLeftShift() {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker");
address start = __ pc();

// ShiftTwo is only ever reached by fall-through; nothing branches to it.
Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;

// Incoming arguments.
Register newArr = c_rarg0;
Register oldArr = c_rarg1;
Register newIdx = c_rarg2;
Register shiftCount = c_rarg3;
Register numIter = c_rarg4;

// Temporaries.
Register shiftRevCount = rscratch1;
Register oldArrNext = rscratch2;

FloatRegister oldElem0 = v0;
FloatRegister oldElem1 = v1;
FloatRegister newElem = v2;
FloatRegister shiftVCount = v3;
FloatRegister shiftVRevCount = v4;

// Nothing to do for zero iterations.
__ cbz(numIter, Exit);

// oldArrNext runs one word ahead of oldArr; newArr is rebased so that it
// points at word newIdx of the destination.
__ add(oldArrNext, oldArr, 4);
__ add(newArr, newArr, newIdx, Assembler::LSL, 2);

// right shift count: shiftRevCount = 32 - shiftCount
__ movw(shiftRevCount, 32);
__ subw(shiftRevCount, shiftRevCount, shiftCount);

// numIter too small to allow a 4-words SIMD loop, fall back to the scalar
// code at ShiftThree
__ cmp(numIter, (u1)4);
__ br(Assembler::LT, ShiftThree);

// Broadcast both shift counts to all four 32-bit lanes. shiftVRevCount is
// then negated so that ushl with it performs a right shift.
__ dup(shiftVCount, __ T4S, shiftCount);
__ dup(shiftVRevCount, __ T4S, shiftRevCount);
__ negr(shiftVRevCount, __ T4S, shiftVRevCount);

// Main loop: produces 4 result words per iteration and advances all three
// pointers by 16 bytes.
__ BIND(ShiftSIMDLoop);

// load 4 words and process: (oldArr[i] << shiftCount) | (oldArr[i + 1] >>> shiftRevCount)
__ ld1(oldElem0, __ T4S, __ post(oldArr, 16));
__ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16));
__ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
__ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
__ orr(newElem, __ T16B, oldElem0, oldElem1);
__ st1(newElem, __ T4S, __ post(newArr, 16));
__ sub(numIter, numIter, 4);

// Keep going while at least 4 words remain; otherwise handle the remainder.
__ cmp(numIter, (u1)4);
__ br(Assembler::LT, ShiftTwoLoop);
__ b(ShiftSIMDLoop);

// Remainder after the SIMD loop (fewer than 4 words left): two words at a
// time, with a single leftover word handled by ShiftOne.
__ BIND(ShiftTwoLoop);
__ cbz(numIter, Exit);
__ cmp(numIter, (u1)1);
__ br(Assembler::EQ, ShiftOne);

// load 2 words and process; only the low two lanes of the shift-count
// vectors are used here.
__ ld1(oldElem0, __ T2S, __ post(oldArr, 8));
__ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8));
__ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
__ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
__ orr(newElem, __ T8B, oldElem0, oldElem1);
__ st1(newElem, __ T2S, __ post(newArr, 8));
__ sub(numIter, numIter, 2);
__ b(ShiftTwoLoop);

// Scalar path taken when numIter is below 4 on entry (numIter is nonzero
// here and is not decremented on this path). The first word is always
// produced; the bit tests afterwards decide how many more follow.
__ BIND(ShiftThree);
__ ldrw(r10, __ post(oldArr, 4));
__ ldrw(r11, __ post(oldArrNext, 4));
__ lslvw(r10, r10, shiftCount);
__ lsrvw(r11, r11, shiftRevCount);
__ orrw(r12, r10, r11);
__ strw(r12, __ post(newArr, 4));
// Bit 1 clear -> numIter was 1, done. Bit 0 clear -> numIter was 2, one more
// word via ShiftOne. Otherwise numIter was 3: fall through for two more.
__ tbz(numIter, 1, Exit);
__ tbz(numIter, 0, ShiftOne);

// Second of three words; the pointers are post-incremented for ShiftOne.
__ BIND(ShiftTwo);
__ ldrw(r10, __ post(oldArr, 4));
__ ldrw(r11, __ post(oldArrNext, 4));
__ lslvw(r10, r10, shiftCount);
__ lsrvw(r11, r11, shiftRevCount);
__ orrw(r12, r10, r11);
__ strw(r12, __ post(newArr, 4));

// Final word at the current pointer positions (no pointer update needed).
// Also the target for the single leftover word of ShiftTwoLoop.
__ BIND(ShiftOne);
__ ldrw(r10, Address(oldArr));
__ ldrw(r11, Address(oldArrNext));
__ lslvw(r10, r10, shiftCount);
__ lsrvw(r11, r11, shiftRevCount);
__ orrw(r12, r10, r11);
__ strw(r12, Address(newArr));

__ BIND(Exit);
__ ret(lr);

return start;
}

void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
Expand Down Expand Up @@ -6224,6 +6456,11 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::_mulAdd = generate_mulAdd();
}

if (UseSIMDForBigIntegerShiftIntrinsics) {
StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
}

if (UseMontgomeryMultiplyIntrinsic) {
StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
Expand Down
38 changes: 36 additions & 2 deletions test/micro/org/openjdk/bench/java/math/BigIntegers.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2014, 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -34,6 +34,7 @@
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.infra.Blackhole;

import java.math.BigInteger;
Expand All @@ -45,11 +46,14 @@
@State(Scope.Thread)
public class BigIntegers {

private BigInteger[] hugeArray, largeArray, smallArray, shiftArray;
private BigInteger[] hugeArray, largeArray, smallArray, shiftArray, smallShiftArray;
public String[] dummyStringArray;
public Object[] dummyArr;
private static final int TESTSIZE = 1000;

@Param({"32", "64", "96", "128", "160", "192", "224", "256"})
private int maxNumbits;

@Setup
public void setup() {
Random r = new Random(1123);
Expand All @@ -72,6 +76,9 @@ public void setup() {
* Each array entry is at most 16k bits
* in size
*/
smallShiftArray = new BigInteger[TESTSIZE]; /*
* Small numbers, bits count in range [maxNumbits - 31, maxNumbits]
*/

dummyStringArray = new String[TESTSIZE];
dummyArr = new Object[TESTSIZE];
Expand All @@ -84,6 +91,7 @@ public void setup() {
largeArray[i] = new BigInteger("" + ((long) value + (long) Integer.MAX_VALUE));
smallArray[i] = new BigInteger("" + ((long) value / 1000));
shiftArray[i] = new BigInteger(numbits, r);
smallShiftArray[i] = new BigInteger(Math.max(maxNumbits - value % 32, 0), r);
}
}

Expand Down Expand Up @@ -177,4 +185,30 @@ public void testRightShift(Blackhole bh) {
}
bh.consume(tmp);
}

/**
 * Benchmarks {@code BigInteger.shiftLeft} on every entry of {@code smallShiftArray}.
 * One shift distance in the range [1, 30] is drawn at random per invocation and
 * applied to all entries; each result is passed to the Blackhole.
 */
@Benchmark
@OperationsPerInvocation(TESTSIZE)
public void testSmallLeftShift(Blackhole bh) {
    Random generator = new Random();
    final int distance = 1 + generator.nextInt(30);
    for (BigInteger operand : smallShiftArray) {
        // Consume every result so the shift cannot be optimized away.
        bh.consume(operand.shiftLeft(distance));
    }
}

/**
 * Benchmarks {@code BigInteger.shiftRight} on every entry of {@code smallShiftArray}.
 * One shift distance in the range [1, 30] is drawn at random per invocation and
 * applied to all entries; each result is passed to the Blackhole.
 */
@Benchmark
@OperationsPerInvocation(TESTSIZE)
public void testSmallRightShift(Blackhole bh) {
    Random generator = new Random();
    final int distance = 1 + generator.nextInt(30);
    for (BigInteger operand : smallShiftArray) {
        // Consume every result so the shift cannot be optimized away.
        bh.consume(operand.shiftRight(distance));
    }
}
}

1 comment on commit 6b2d11b

@bridgekeeper
Copy link

@bridgekeeper bridgekeeper bot commented on 6b2d11b Oct 28, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.