Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

8255246: AArch64: Implement BigInteger shiftRight and shiftLeft accelerator/intrinsic #861

Closed
wants to merge 11 commits into from
2 changes: 2 additions & 0 deletions src/hotspot/cpu/aarch64/globals_aarch64.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ define_pd_global(intx, InlineSmallCode, 1000);
"Use SIMD instructions in generated array equals code") \
product(bool, UseSimpleArrayEquals, false, \
"Use simpliest and shortest implementation for array equals") \
product(bool, UseSIMDForBigIntegerShiftIntrinsics, true, \
"Use SIMD instructions for left/right shift of BigInteger") \
product(bool, AvoidUnalignedAccesses, false, \
"Avoid generating unaligned memory accesses") \
product(bool, UseLSE, false, \
Expand Down
237 changes: 237 additions & 0 deletions src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3968,6 +3968,238 @@ class StubGenerator: public StubCodeGenerator {
return start;
}

// Emits the "bigIntegerRightShiftWorker" stub and returns its entry address.
//
// The generated code fills numIter 32-bit words of the destination, where
// (with newArr already advanced by newIdx words) each word is
//
//   newArr[i] = (oldArr[i + 1] >>> shiftCount) | (oldArr[i] << (32 - shiftCount))
//
// for i in [0, numIter). Words are produced from the highest index down to
// the lowest. Note that oldArr[numIter] is read, i.e. numIter + 1 source
// words are accessed.
//
// NOTE(review): shiftCount is presumably in the range 1..31 (the code derives
// the complementary count as 32 - shiftCount) -- confirm against the caller.
//
// Arguments:
//
// Input:
// c_rarg0 - newArr address
// c_rarg1 - oldArr address
// c_rarg2 - newIdx
// c_rarg3 - shiftCount
// c_rarg4 - numIter
//
// Temporaries used by the generated code: rscratch1, rscratch2, r10-r14, v0-v4.
//
address generate_bigIntegerRightShift() {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker");
address start = __ pc();

Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;

Register newArr = c_rarg0;
Register oldArr = c_rarg1;
Register newIdx = c_rarg2;
Register shiftCount = c_rarg3;
Register numIter = c_rarg4;
// idx is an alias of numIter: the same register counts the words still to do.
Register idx = numIter;

Register newArrCur = rscratch1;
Register shiftRevCount = rscratch2;
Register oldArrCur = r13;
Register oldArrNext = r14;

FloatRegister oldElem0 = v0;
FloatRegister oldElem1 = v1;
FloatRegister newElem = v2;
FloatRegister shiftVCount = v3;
FloatRegister shiftVRevCount = v4;

// Nothing to do when numIter == 0.
__ cbz(idx, Exit);

// newArr += newIdx * 4 bytes, so all later stores are relative to newArr[newIdx].
__ add(newArr, newArr, newIdx, Assembler::LSL, 2);

// left shift count: shiftRevCount = 32 - shiftCount
__ movw(shiftRevCount, 32);
__ subw(shiftRevCount, shiftRevCount, shiftCount);

// numIter too small to allow a 4-words SIMD loop, rolling back
// to the scalar tail, which handles 1, 2 or 3 words.
__ cmp(numIter, (u1)4);
__ br(Assembler::LT, ShiftThree);

// Broadcast both counts into every 32-bit lane. USHL treats the per-lane
// count as signed and shifts right for negative values, so shiftVCount is
// negated to obtain the logical right shift by shiftCount.
__ dup(shiftVCount, __ T4S, shiftCount);
__ dup(shiftVRevCount, __ T4S, shiftRevCount);
__ negr(shiftVCount, __ T4S, shiftVCount);

// Main loop: four result words per iteration, walking downwards from the
// top of the arrays.
__ BIND(ShiftSIMDLoop);

// Calculate the load addresses
//   oldArrNext = &oldArr[idx]      (words shifted left)
//   oldArrCur  = &oldArr[idx + 1]  (words shifted right)
//   newArrCur  = &newArr[idx]
__ sub(idx, idx, 4);
__ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
__ add(newArrCur, newArr, idx, Assembler::LSL, 2);
__ add(oldArrCur, oldArrNext, 4);

// Load 4 words and process
__ ld1(oldElem0, __ T4S, Address(oldArrCur));
__ ld1(oldElem1, __ T4S, Address(oldArrNext));
__ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
__ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
__ orr(newElem, __ T16B, oldElem0, oldElem1);
__ st1(newElem, __ T4S, Address(newArrCur));

// Keep looping while at least four words remain; otherwise finish the
// remaining 0..3 words below.
__ cmp(idx, (u1)4);
__ br(Assembler::LT, ShiftTwoLoop);
__ b(ShiftSIMDLoop);

// Remainder after the SIMD loop: done if idx == 0, a single scalar word if
// idx == 1, otherwise two words at a time using the low two vector lanes.
__ BIND(ShiftTwoLoop);
__ cbz(idx, Exit);
__ cmp(idx, (u1)1);
__ br(Assembler::EQ, ShiftOne);

// Calculate the load addresses
__ sub(idx, idx, 2);
__ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
__ add(newArrCur, newArr, idx, Assembler::LSL, 2);
__ add(oldArrCur, oldArrNext, 4);

// Load 2 words and process
__ ld1(oldElem0, __ T2S, Address(oldArrCur));
__ ld1(oldElem1, __ T2S, Address(oldArrNext));
__ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
__ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
__ orr(newElem, __ T8B, oldElem0, oldElem1);
__ st1(newElem, __ T2S, Address(newArrCur));
__ b(ShiftTwoLoop);

// Scalar path for numIter < 4 (non-zero). Dispatch on the low two bits of idx:
//   bit 1 clear            -> idx == 1 -> ShiftOne
//   bit 1 set, bit 0 clear -> idx == 2 -> ShiftTwo
//   both set               -> idx == 3 -> fall through
// Each case falls through into the next, so words 2, 1, 0 are produced in turn.
__ BIND(ShiftThree);
__ tbz(idx, 1, ShiftOne);
__ tbz(idx, 0, ShiftTwo);
// newArr[2] = (oldArr[3] >>> shiftCount) | (oldArr[2] << shiftRevCount)
__ ldrw(r10, Address(oldArr, 12));
__ ldrw(r11, Address(oldArr, 8));
__ lsrvw(r10, r10, shiftCount);
__ lslvw(r11, r11, shiftRevCount);
__ orrw(r12, r10, r11);
__ strw(r12, Address(newArr, 8));

// newArr[1] = (oldArr[2] >>> shiftCount) | (oldArr[1] << shiftRevCount)
__ BIND(ShiftTwo);
__ ldrw(r10, Address(oldArr, 8));
__ ldrw(r11, Address(oldArr, 4));
__ lsrvw(r10, r10, shiftCount);
__ lslvw(r11, r11, shiftRevCount);
__ orrw(r12, r10, r11);
__ strw(r12, Address(newArr, 4));

// newArr[0] = (oldArr[1] >>> shiftCount) | (oldArr[0] << shiftRevCount)
// Also the target of ShiftTwoLoop when exactly one word is left; the fixed
// offsets are valid there because oldArr and newArr are never advanced.
__ BIND(ShiftOne);
__ ldrw(r10, Address(oldArr, 4));
__ ldrw(r11, Address(oldArr));
__ lsrvw(r10, r10, shiftCount);
__ lslvw(r11, r11, shiftRevCount);
__ orrw(r12, r10, r11);
__ strw(r12, Address(newArr));

__ BIND(Exit);
__ ret(lr);

return start;
}

// Emits the "bigIntegerLeftShiftWorker" stub and returns its entry address.
//
// The generated code fills numIter 32-bit words of the destination, where
// (with newArr already advanced by newIdx words) each word is
//
//   newArr[i] = (oldArr[i] << shiftCount) | (oldArr[i + 1] >>> (32 - shiftCount))
//
// for i in [0, numIter). Words are produced from the lowest index upwards,
// advancing the array pointers with post-increment addressing. Note that
// oldArr[numIter] is read, i.e. numIter + 1 source words are accessed.
//
// NOTE(review): shiftCount is presumably in the range 1..31 (the code derives
// the complementary count as 32 - shiftCount) -- confirm against the caller.
//
// Arguments:
//
// Input:
// c_rarg0 - newArr address
// c_rarg1 - oldArr address
// c_rarg2 - newIdx
// c_rarg3 - shiftCount
// c_rarg4 - numIter
//
// Temporaries used by the generated code: rscratch1, rscratch2, r10-r12, v0-v4.
//
address generate_bigIntegerLeftShift() {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker");
address start = __ pc();

Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;

Register newArr = c_rarg0;
Register oldArr = c_rarg1;
Register newIdx = c_rarg2;
Register shiftCount = c_rarg3;
Register numIter = c_rarg4;

Register shiftRevCount = rscratch1;
Register oldArrNext = rscratch2;

FloatRegister oldElem0 = v0;
FloatRegister oldElem1 = v1;
FloatRegister newElem = v2;
FloatRegister shiftVCount = v3;
FloatRegister shiftVRevCount = v4;

// Nothing to do when numIter == 0.
__ cbz(numIter, Exit);

// oldArrNext tracks the word following the one oldArr points at;
// newArr += newIdx * 4 bytes so stores start at newArr[newIdx].
__ add(oldArrNext, oldArr, 4);
__ add(newArr, newArr, newIdx, Assembler::LSL, 2);

// right shift count: shiftRevCount = 32 - shiftCount
__ movw(shiftRevCount, 32);
__ subw(shiftRevCount, shiftRevCount, shiftCount);

// numIter too small to allow a 4-words SIMD loop, rolling back
// to the scalar tail, which handles 1, 2 or 3 words.
__ cmp(numIter, (u1)4);
__ br(Assembler::LT, ShiftThree);

// Broadcast both counts into every 32-bit lane. USHL treats the per-lane
// count as signed and shifts right for negative values, so shiftVRevCount
// is negated to obtain the logical right shift by 32 - shiftCount.
__ dup(shiftVCount, __ T4S, shiftCount);
__ dup(shiftVRevCount, __ T4S, shiftRevCount);
__ negr(shiftVRevCount, __ T4S, shiftVRevCount);

// Main loop: four result words per iteration, all three pointers advance
// by 16 bytes.
__ BIND(ShiftSIMDLoop);

// load 4 words and process
__ ld1(oldElem0, __ T4S, __ post(oldArr, 16));
__ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16));
__ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
__ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
__ orr(newElem, __ T16B, oldElem0, oldElem1);
__ st1(newElem, __ T4S, __ post(newArr, 16));
__ sub(numIter, numIter, 4);

// Keep looping while at least four words remain; otherwise finish the
// remaining 0..3 words below.
__ cmp(numIter, (u1)4);
__ br(Assembler::LT, ShiftTwoLoop);
__ b(ShiftSIMDLoop);

// Remainder after the SIMD loop: done if numIter == 0, a single scalar word
// if numIter == 1, otherwise two words at a time using the low two lanes.
__ BIND(ShiftTwoLoop);
__ cbz(numIter, Exit);
__ cmp(numIter, (u1)1);
__ br(Assembler::EQ, ShiftOne);

// load 2 words and process
__ ld1(oldElem0, __ T2S, __ post(oldArr, 8));
__ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8));
__ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
__ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
__ orr(newElem, __ T8B, oldElem0, oldElem1);
__ st1(newElem, __ T2S, __ post(newArr, 8));
__ sub(numIter, numIter, 2);
__ b(ShiftTwoLoop);

// Scalar path for numIter < 4 (non-zero). The first word is always produced
// here; numIter itself is not decremented on this path, its low two bits
// select how many further words follow:
//   bit 1 clear            -> numIter == 1 -> done
//   bit 1 set, bit 0 clear -> numIter == 2 -> one more word at ShiftOne
//   both set               -> numIter == 3 -> fall through ShiftTwo, ShiftOne
__ BIND(ShiftThree);
__ ldrw(r10, __ post(oldArr, 4));
__ ldrw(r11, __ post(oldArrNext, 4));
__ lslvw(r10, r10, shiftCount);
__ lsrvw(r11, r11, shiftRevCount);
__ orrw(r12, r10, r11);
__ strw(r12, __ post(newArr, 4));
__ tbz(numIter, 1, Exit);
__ tbz(numIter, 0, ShiftOne);

// Second-to-last word; pointers advance so ShiftOne sees the final word.
__ BIND(ShiftTwo);
__ ldrw(r10, __ post(oldArr, 4));
__ ldrw(r11, __ post(oldArrNext, 4));
__ lslvw(r10, r10, shiftCount);
__ lsrvw(r11, r11, shiftRevCount);
__ orrw(r12, r10, r11);
__ strw(r12, __ post(newArr, 4));

// Final word: no post-increment needed. Also the target of ShiftTwoLoop
// when exactly one word is left.
__ BIND(ShiftOne);
__ ldrw(r10, Address(oldArr));
__ ldrw(r11, Address(oldArrNext));
__ lslvw(r10, r10, shiftCount);
__ lsrvw(r11, r11, shiftRevCount);
__ orrw(r12, r10, r11);
__ strw(r12, Address(newArr));

__ BIND(Exit);
__ ret(lr);

return start;
}

void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
Expand Down Expand Up @@ -6224,6 +6456,11 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::_mulAdd = generate_mulAdd();
}

if (UseSIMDForBigIntegerShiftIntrinsics) {
StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
}

if (UseMontgomeryMultiplyIntrinsic) {
StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
Expand Down
38 changes: 36 additions & 2 deletions test/micro/org/openjdk/bench/java/math/BigIntegers.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2014, 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -34,6 +34,7 @@
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.infra.Blackhole;

import java.math.BigInteger;
Expand All @@ -45,11 +46,14 @@
@State(Scope.Thread)
public class BigIntegers {

private BigInteger[] hugeArray, largeArray, smallArray, shiftArray;
private BigInteger[] hugeArray, largeArray, smallArray, shiftArray, smallShiftArray;
public String[] dummyStringArray;
public Object[] dummyArr;
private static final int TESTSIZE = 1000;

@Param({"32", "64", "96", "128", "160", "192", "224", "256"})
private int maxNumbits;

@Setup
public void setup() {
Random r = new Random(1123);
Expand All @@ -72,6 +76,9 @@ public void setup() {
* Each array entry is at most 16k bits
* in size
*/
smallShiftArray = new BigInteger[TESTSIZE]; /*
* Small numbers, bits count in range [maxNumbits - 31, maxNumbits]
*/

dummyStringArray = new String[TESTSIZE];
dummyArr = new Object[TESTSIZE];
Expand All @@ -84,6 +91,7 @@ public void setup() {
largeArray[i] = new BigInteger("" + ((long) value + (long) Integer.MAX_VALUE));
smallArray[i] = new BigInteger("" + ((long) value / 1000));
shiftArray[i] = new BigInteger(numbits, r);
smallShiftArray[i] = new BigInteger(Math.max(maxNumbits - value % 32, 0), r);
}
}

Expand Down Expand Up @@ -177,4 +185,30 @@ public void testRightShift(Blackhole bh) {
}
bh.consume(tmp);
}

/**
 * Benchmarks {@code BigInteger.shiftLeft} on every entry of {@code smallShiftArray}.
 * One shift distance in the range [1, 30] is drawn per invocation and applied to
 * all entries; each shifted value is passed to the Blackhole.
 */
@Benchmark
@OperationsPerInvocation(TESTSIZE)
public void testSmallLeftShift(Blackhole bh) {
    // Same distance for the whole pass over the array.
    final int distance = 1 + new Random().nextInt(30);
    final BigInteger[] inputs = smallShiftArray;
    for (int i = 0; i < inputs.length; i++) {
        BigInteger shifted = inputs[i].shiftLeft(distance);
        bh.consume(shifted);
    }
}

/**
 * Benchmarks {@code BigInteger.shiftRight} on every entry of {@code smallShiftArray}.
 * One shift distance in the range [1, 30] is drawn per invocation and applied to
 * all entries; each shifted value is passed to the Blackhole.
 */
@Benchmark
@OperationsPerInvocation(TESTSIZE)
public void testSmallRightShift(Blackhole bh) {
    // Same distance for the whole pass over the array.
    final int distance = 1 + new Random().nextInt(30);
    final BigInteger[] inputs = smallShiftArray;
    for (int i = 0; i < inputs.length; i++) {
        BigInteger shifted = inputs[i].shiftRight(distance);
        bh.consume(shifted);
    }
}
}