8315716: RISC-V: implement ChaCha20 intrinsic

Hamlin Li · pull[bot] · commit 8370514bc60c · 2024-04-17T00:51:13.000Z
Reviewed-by: luhenry, fyang
diff --git a/src/hotspot/cpu/riscv/assembler_riscv.hpp b/src/hotspot/cpu/riscv/assembler_riscv.hpp
@@ -1790,6 +1790,11 @@ enum Nf {
   INSN(vlse32_v, 0b0000111, 0b110, 0b10, 0b0);
   INSN(vlse64_v, 0b0000111, 0b111, 0b10, 0b0);
 
+  INSN(vsse8_v,  0b0100111, 0b000, 0b10, 0b0);
+  INSN(vsse16_v, 0b0100111, 0b101, 0b10, 0b0);
+  INSN(vsse32_v, 0b0100111, 0b110, 0b10, 0b0);
+  INSN(vsse64_v, 0b0100111, 0b111, 0b10, 0b0);
+
 #undef INSN
 #undef patch_VLdSt
 
diff --git a/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp b/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp
@@ -1289,6 +1289,13 @@ class MacroAssembler: public Assembler {
   }
 
   // vector pseudo instructions
+  // Rotate vd left by 'shift' bits in place (32-bit version); tmp_vr is overwritten as scratch.
+  inline void vrole32_vi(VectorRegister vd, uint32_t shift, VectorRegister tmp_vr) {
+    vsrl_vi(tmp_vr, vd, 32 - shift); // tmp_vr = the high bits that wrap around to the low end
+    vsll_vi(vd, vd, shift); // must come after the vsrl above, which still needs the old vd
+    vor_vv(vd, vd, tmp_vr); // merge the two shifted halves into the rotated result
+  }
+
   inline void vl1r_v(VectorRegister vd, Register rs) {
     vl1re8_v(vd, rs);
   }
diff --git a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp
@@ -4277,6 +4277,142 @@ class StubGenerator: public StubCodeGenerator {
     return (address) start;
   }
 
+  /**
+   * Emit the ChaCha20 quarter round on values held in four vector registers, updating them in place.
+   *
+   * @param aVec the SIMD register containing only the "a" values
+   * @param bVec the SIMD register containing only the "b" values
+   * @param cVec the SIMD register containing only the "c" values
+   * @param dVec the SIMD register containing only the "d" values
+   * @param tmp_vr temporary vector register that holds intermediate values (overwritten by the rotates).
+   */
+  void chacha20_quarter_round(VectorRegister aVec, VectorRegister bVec,
+                          VectorRegister cVec, VectorRegister dVec, VectorRegister tmp_vr) {
+    // a += b, d ^= a, d <<<= 16
+    __ vadd_vv(aVec, aVec, bVec);
+    __ vxor_vv(dVec, dVec, aVec);
+    __ vrole32_vi(dVec, 16, tmp_vr);
+
+    // c += d, b ^= c, b <<<= 12
+    __ vadd_vv(cVec, cVec, dVec);
+    __ vxor_vv(bVec, bVec, cVec);
+    __ vrole32_vi(bVec, 12, tmp_vr);
+
+    // a += b, d ^= a, d <<<= 8
+    __ vadd_vv(aVec, aVec, bVec);
+    __ vxor_vv(dVec, dVec, aVec);
+    __ vrole32_vi(dVec, 8, tmp_vr);
+
+    // c += d, b ^= c, b <<<= 7
+    __ vadd_vv(cVec, cVec, dVec);
+    __ vxor_vv(bVec, bVec, cVec);
+    __ vrole32_vi(bVec, 7, tmp_vr);
+  }
+
+  /**
+   * int com.sun.crypto.provider.ChaCha20Cipher.implChaCha20Block(int[] initState, byte[] result)
+   *
+   *  Input arguments:
+   *  c_rarg0   - state, the starting state
+   *  c_rarg1   - key_stream, the array that will hold the result of the ChaCha20 block function
+   *
+   *  Implementation Note:
+   *   Parallelization is achieved by loading individual state elements into vectors for N blocks.
+   *   N depends on the length of a single vector register; c_rarg0 returns the key stream length, N * 64.
+   */
+  address generate_chacha20Block() {
+    Label L_Rounds;
+
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "chacha20Block");
+    address start = __ pc();
+    __ enter();
+
+    const int states_len = 16; // number of 32-bit words loaded from the state array
+    const int step = 4; // byte distance between consecutive 32-bit words
+    const Register state = c_rarg0;
+    const Register key_stream = c_rarg1;
+    const Register tmp_addr = t0;
+    const Register length = t1;
+
+    // Organize vector registers in an array that facilitates
+    // putting repetitive opcodes into loop structures below.
+    const VectorRegister work_vrs[16] = {
+      v0, v1, v2,  v3,  v4,  v5,  v6,  v7,
+      v8, v9, v10, v11, v12, v13, v14, v15
+    };
+    const VectorRegister tmp_vr = v16;
+    const VectorRegister counter_vr = v17;
+
+    {
+      // Request up to 16 blocks (16 * 64 = 1024 bytes), as
+      // com.sun.crypto.provider.ChaCha20Cipher.KS_MAX_LEN is 1024 at the Java level.
+      __ vsetivli(length, 16, Assembler::e32, Assembler::m1); // length = number of blocks N processed
+    }
+
+    // Load from source state.
+    // Every element in source state is duplicated to all elements in the corresponding vector.
+    __ mv(tmp_addr, state);
+    for (int i = 0; i < states_len; i += 1) {
+      __ vlse32_v(work_vrs[i], tmp_addr, zr); // stride register is zr, so every element reads the same word
+      __ addi(tmp_addr, tmp_addr, step);
+    }
+    // Adjust counter for every individual block.
+    __ vid_v(counter_vr);
+    __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);
+
+    // Perform 10 iterations of the 8 quarter round set
+    {
+      const Register loop = t2; // share t2 with other non-overlapping usages.
+      __ mv(loop, 10);
+      __ BIND(L_Rounds);
+
+      chacha20_quarter_round(work_vrs[0], work_vrs[4], work_vrs[8],  work_vrs[12], tmp_vr);
+      chacha20_quarter_round(work_vrs[1], work_vrs[5], work_vrs[9],  work_vrs[13], tmp_vr);
+      chacha20_quarter_round(work_vrs[2], work_vrs[6], work_vrs[10], work_vrs[14], tmp_vr);
+      chacha20_quarter_round(work_vrs[3], work_vrs[7], work_vrs[11], work_vrs[15], tmp_vr);
+
+      chacha20_quarter_round(work_vrs[0], work_vrs[5], work_vrs[10], work_vrs[15], tmp_vr);
+      chacha20_quarter_round(work_vrs[1], work_vrs[6], work_vrs[11], work_vrs[12], tmp_vr);
+      chacha20_quarter_round(work_vrs[2], work_vrs[7], work_vrs[8],  work_vrs[13], tmp_vr);
+      chacha20_quarter_round(work_vrs[3], work_vrs[4], work_vrs[9],  work_vrs[14], tmp_vr);
+
+      __ sub(loop, loop, 1);
+      __ bnez(loop, L_Rounds);
+    }
+
+    // Add the original state into the end working state.
+    // We do this by first duplicating every element in source state array to the corresponding
+    // vector, then adding it to the post-loop working state.
+    __ mv(tmp_addr, state);
+    for (int i = 0; i < states_len; i += 1) {
+      __ vlse32_v(tmp_vr, tmp_addr, zr);
+      __ addi(tmp_addr, tmp_addr, step);
+      __ vadd_vv(work_vrs[i], work_vrs[i], tmp_vr);
+    }
+    // The state word just re-added is identical for all blocks, so add the per-block counter again.
+    __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);
+
+    // Store result to key stream.
+    {
+      const Register stride = t2; // share t2 with other non-overlapping usages.
+      // Every block occupies 64 bytes, so we use 64 as stride of the vector store.
+      __ mv(stride, 64);
+      for (int i = 0; i < states_len; i += 1) {
+        __ vsse32_v(work_vrs[i], key_stream, stride);
+        __ addi(key_stream, key_stream, step); // next word slot within each block; advances c_rarg1
+      }
+    }
+
+    // Return length of output key_stream
+    __ slli(c_rarg0, length, 6); // length * 64 bytes per block
+
+    __ leave();
+    __ ret();
+
+    return (address) start;
+  }
+
 #if INCLUDE_JFR
 
   static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) {
@@ -4496,6 +4632,11 @@ class StubGenerator: public StubCodeGenerator {
       StubRoutines::_md5_implCompress   = generate_md5_implCompress(false, "md5_implCompress");
       StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true,  "md5_implCompressMB");
     }
+
+    if (UseChaCha20Intrinsics) {
+      StubRoutines::_chacha20Block = generate_chacha20Block();
+    }
+
 #endif // COMPILER2_OR_JVMCI
   }
 
diff --git a/src/hotspot/cpu/riscv/vm_version_riscv.cpp b/src/hotspot/cpu/riscv/vm_version_riscv.cpp
@@ -253,6 +253,16 @@ void VM_Version::initialize() {
     warning("Block zeroing is not available");
     FLAG_SET_DEFAULT(UseBlockZeroing, false);
   }
+  if (UseRVV) {
+    if (FLAG_IS_DEFAULT(UseChaCha20Intrinsics)) {
+      FLAG_SET_DEFAULT(UseChaCha20Intrinsics, true);
+    }
+  } else if (UseChaCha20Intrinsics) {
+    if (!FLAG_IS_DEFAULT(UseChaCha20Intrinsics)) {
+      warning("Chacha20 intrinsic requires RVV instructions (not available on this CPU)");
+    }
+    FLAG_SET_DEFAULT(UseChaCha20Intrinsics, false);
+  }
 
 #ifdef COMPILER2
   c2_initialize();

Original file line number	Diff line number	Diff line change
`@@ -1289,6 +1289,13 @@ class MacroAssembler: public Assembler {`
`1289`	`1289`	`}`
`1290`	`1290`
`1291`	`1291`	`// vector pseudo instructions`
	`1292`	`+ // rotate vector register left with shift bits, 32-bit version`
	`1293`	`+ inline void vrole32_vi(VectorRegister vd, uint32_t shift, VectorRegister tmp_vr) {`
	`1294`	`+ vsrl_vi(tmp_vr, vd, 32 - shift);`
	`1295`	`+ vsll_vi(vd, vd, shift);`
	`1296`	`+ vor_vv(vd, vd, tmp_vr);`
	`1297`	`+ }`
	`1298`	`+`
`1292`	`1299`	`inline void vl1r_v(VectorRegister vd, Register rs) {`
`1293`	`1300`	`vl1re8_v(vd, rs);`
`1294`	`1301`	`}`