Skip to content

Commit 8370514

Browse files
Hamlin Lipull[bot]
authored and committed
8315716: RISC-V: implement ChaCha20 intrinsic
Reviewed-by: luhenry, fyang
1 parent 1795840 commit 8370514

File tree

4 files changed

+163
-0
lines changed

4 files changed

+163
-0
lines changed

src/hotspot/cpu/riscv/assembler_riscv.hpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1790,6 +1790,11 @@ enum Nf {
17901790
INSN(vlse32_v, 0b0000111, 0b110, 0b10, 0b0);
17911791
INSN(vlse64_v, 0b0000111, 0b111, 0b10, 0b0);
17921792

1793+
INSN(vsse8_v, 0b0100111, 0b000, 0b10, 0b0);
1794+
INSN(vsse16_v, 0b0100111, 0b101, 0b10, 0b0);
1795+
INSN(vsse32_v, 0b0100111, 0b110, 0b10, 0b0);
1796+
INSN(vsse64_v, 0b0100111, 0b111, 0b10, 0b0);
1797+
17931798
#undef INSN
17941799
#undef patch_VLdSt
17951800

src/hotspot/cpu/riscv/macroAssembler_riscv.hpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1289,6 +1289,13 @@ class MacroAssembler: public Assembler {
12891289
}
12901290

12911291
// vector pseudo instructions
1292+
// rotate vector register left with shift bits, 32-bit version
1293+
inline void vrole32_vi(VectorRegister vd, uint32_t shift, VectorRegister tmp_vr) {
1294+
vsrl_vi(tmp_vr, vd, 32 - shift);
1295+
vsll_vi(vd, vd, shift);
1296+
vor_vv(vd, vd, tmp_vr);
1297+
}
1298+
12921299
inline void vl1r_v(VectorRegister vd, Register rs) {
12931300
vl1re8_v(vd, rs);
12941301
}

src/hotspot/cpu/riscv/stubGenerator_riscv.cpp

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4277,6 +4277,142 @@ class StubGenerator: public StubCodeGenerator {
42774277
return (address) start;
42784278
}
42794279

4280+
/**
4281+
* Perform the quarter round calculations on values contained within four vector registers.
4282+
*
4283+
* @param aVec the SIMD register containing only the "a" values
4284+
* @param bVec the SIMD register containing only the "b" values
4285+
* @param cVec the SIMD register containing only the "c" values
4286+
* @param dVec the SIMD register containing only the "d" values
4287+
* @param tmp_vr temporary vector register holds intermedia values.
4288+
*/
4289+
void chacha20_quarter_round(VectorRegister aVec, VectorRegister bVec,
4290+
VectorRegister cVec, VectorRegister dVec, VectorRegister tmp_vr) {
4291+
// a += b, d ^= a, d <<<= 16
4292+
__ vadd_vv(aVec, aVec, bVec);
4293+
__ vxor_vv(dVec, dVec, aVec);
4294+
__ vrole32_vi(dVec, 16, tmp_vr);
4295+
4296+
// c += d, b ^= c, b <<<= 12
4297+
__ vadd_vv(cVec, cVec, dVec);
4298+
__ vxor_vv(bVec, bVec, cVec);
4299+
__ vrole32_vi(bVec, 12, tmp_vr);
4300+
4301+
// a += b, d ^= a, d <<<= 8
4302+
__ vadd_vv(aVec, aVec, bVec);
4303+
__ vxor_vv(dVec, dVec, aVec);
4304+
__ vrole32_vi(dVec, 8, tmp_vr);
4305+
4306+
// c += d, b ^= c, b <<<= 7
4307+
__ vadd_vv(cVec, cVec, dVec);
4308+
__ vxor_vv(bVec, bVec, cVec);
4309+
__ vrole32_vi(bVec, 7, tmp_vr);
4310+
}
4311+
4312+
/**
4313+
* int com.sun.crypto.provider.ChaCha20Cipher.implChaCha20Block(int[] initState, byte[] result)
4314+
*
4315+
* Input arguments:
4316+
* c_rarg0 - state, the starting state
4317+
* c_rarg1 - key_stream, the array that will hold the result of the ChaCha20 block function
4318+
*
4319+
* Implementation Note:
4320+
* Parallelization is achieved by loading individual state elements into vectors for N blocks.
4321+
* N depends on single vector register length.
4322+
*/
4323+
address generate_chacha20Block() {
4324+
Label L_Rounds;
4325+
4326+
__ align(CodeEntryAlignment);
4327+
StubCodeMark mark(this, "StubRoutines", "chacha20Block");
4328+
address start = __ pc();
4329+
__ enter();
4330+
4331+
const int states_len = 16;
4332+
const int step = 4;
4333+
const Register state = c_rarg0;
4334+
const Register key_stream = c_rarg1;
4335+
const Register tmp_addr = t0;
4336+
const Register length = t1;
4337+
4338+
// Organize vector registers in an array that facilitates
4339+
// putting repetitive opcodes into loop structures below.
4340+
const VectorRegister work_vrs[16] = {
4341+
v0, v1, v2, v3, v4, v5, v6, v7,
4342+
v8, v9, v10, v11, v12, v13, v14, v15
4343+
};
4344+
const VectorRegister tmp_vr = v16;
4345+
const VectorRegister counter_vr = v17;
4346+
4347+
{
4348+
// Put 16 here, as com.sun.crypto.providerChaCha20Cipher.KS_MAX_LEN is 1024
4349+
// in java level.
4350+
__ vsetivli(length, 16, Assembler::e32, Assembler::m1);
4351+
}
4352+
4353+
// Load from source state.
4354+
// Every element in source state is duplicated to all elements in the corresponding vector.
4355+
__ mv(tmp_addr, state);
4356+
for (int i = 0; i < states_len; i += 1) {
4357+
__ vlse32_v(work_vrs[i], tmp_addr, zr);
4358+
__ addi(tmp_addr, tmp_addr, step);
4359+
}
4360+
// Adjust counter for every individual block.
4361+
__ vid_v(counter_vr);
4362+
__ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);
4363+
4364+
// Perform 10 iterations of the 8 quarter round set
4365+
{
4366+
const Register loop = t2; // share t2 with other non-overlapping usages.
4367+
__ mv(loop, 10);
4368+
__ BIND(L_Rounds);
4369+
4370+
chacha20_quarter_round(work_vrs[0], work_vrs[4], work_vrs[8], work_vrs[12], tmp_vr);
4371+
chacha20_quarter_round(work_vrs[1], work_vrs[5], work_vrs[9], work_vrs[13], tmp_vr);
4372+
chacha20_quarter_round(work_vrs[2], work_vrs[6], work_vrs[10], work_vrs[14], tmp_vr);
4373+
chacha20_quarter_round(work_vrs[3], work_vrs[7], work_vrs[11], work_vrs[15], tmp_vr);
4374+
4375+
chacha20_quarter_round(work_vrs[0], work_vrs[5], work_vrs[10], work_vrs[15], tmp_vr);
4376+
chacha20_quarter_round(work_vrs[1], work_vrs[6], work_vrs[11], work_vrs[12], tmp_vr);
4377+
chacha20_quarter_round(work_vrs[2], work_vrs[7], work_vrs[8], work_vrs[13], tmp_vr);
4378+
chacha20_quarter_round(work_vrs[3], work_vrs[4], work_vrs[9], work_vrs[14], tmp_vr);
4379+
4380+
__ sub(loop, loop, 1);
4381+
__ bnez(loop, L_Rounds);
4382+
}
4383+
4384+
// Add the original state into the end working state.
4385+
// We do this by first duplicating every element in source state array to the corresponding
4386+
// vector, then adding it to the post-loop working state.
4387+
__ mv(tmp_addr, state);
4388+
for (int i = 0; i < states_len; i += 1) {
4389+
__ vlse32_v(tmp_vr, tmp_addr, zr);
4390+
__ addi(tmp_addr, tmp_addr, step);
4391+
__ vadd_vv(work_vrs[i], work_vrs[i], tmp_vr);
4392+
}
4393+
// Add the counter overlay onto work_vrs[12] at the end.
4394+
__ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);
4395+
4396+
// Store result to key stream.
4397+
{
4398+
const Register stride = t2; // share t2 with other non-overlapping usages.
4399+
// Every block occupies 64 bytes, so we use 64 as stride of the vector store.
4400+
__ mv(stride, 64);
4401+
for (int i = 0; i < states_len; i += 1) {
4402+
__ vsse32_v(work_vrs[i], key_stream, stride);
4403+
__ addi(key_stream, key_stream, step);
4404+
}
4405+
}
4406+
4407+
// Return length of output key_stream
4408+
__ slli(c_rarg0, length, 6);
4409+
4410+
__ leave();
4411+
__ ret();
4412+
4413+
return (address) start;
4414+
}
4415+
42804416
#if INCLUDE_JFR
42814417

42824418
static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) {
@@ -4496,6 +4632,11 @@ class StubGenerator: public StubCodeGenerator {
44964632
StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress");
44974633
StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB");
44984634
}
4635+
4636+
if (UseChaCha20Intrinsics) {
4637+
StubRoutines::_chacha20Block = generate_chacha20Block();
4638+
}
4639+
44994640
#endif // COMPILER2_OR_JVMCI
45004641
}
45014642

src/hotspot/cpu/riscv/vm_version_riscv.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,16 @@ void VM_Version::initialize() {
253253
warning("Block zeroing is not available");
254254
FLAG_SET_DEFAULT(UseBlockZeroing, false);
255255
}
256+
if (UseRVV) {
257+
if (FLAG_IS_DEFAULT(UseChaCha20Intrinsics)) {
258+
FLAG_SET_DEFAULT(UseChaCha20Intrinsics, true);
259+
}
260+
} else if (UseChaCha20Intrinsics) {
261+
if (!FLAG_IS_DEFAULT(UseChaCha20Intrinsics)) {
262+
warning("Chacha20 intrinsic requires RVV instructions (not available on this CPU)");
263+
}
264+
FLAG_SET_DEFAULT(UseChaCha20Intrinsics, false);
265+
}
256266

257267
#ifdef COMPILER2
258268
c2_initialize();

0 commit comments

Comments
 (0)