@@ -4277,6 +4277,142 @@ class StubGenerator: public StubCodeGenerator {
42774277    return  (address) start;
42784278  }
42794279
4280+   /* *
4281+    * Perform the quarter round calculations on values contained within four vector registers. 
4282+    * 
4283+    * @param aVec the SIMD register containing only the "a" values 
4284+    * @param bVec the SIMD register containing only the "b" values 
4285+    * @param cVec the SIMD register containing only the "c" values 
4286+    * @param dVec the SIMD register containing only the "d" values 
4287+    * @param tmp_vr temporary vector register holds intermedia values. 
4288+    */  
4289+   void  chacha20_quarter_round (VectorRegister aVec, VectorRegister bVec,
4290+                           VectorRegister cVec, VectorRegister dVec, VectorRegister tmp_vr) {
4291+     //  a += b, d ^= a, d <<<= 16
4292+     __ vadd_vv (aVec, aVec, bVec);
4293+     __ vxor_vv (dVec, dVec, aVec);
4294+     __ vrole32_vi (dVec, 16 , tmp_vr);
4295+ 
4296+     //  c += d, b ^= c, b <<<= 12
4297+     __ vadd_vv (cVec, cVec, dVec);
4298+     __ vxor_vv (bVec, bVec, cVec);
4299+     __ vrole32_vi (bVec, 12 , tmp_vr);
4300+ 
4301+     //  a += b, d ^= a, d <<<= 8
4302+     __ vadd_vv (aVec, aVec, bVec);
4303+     __ vxor_vv (dVec, dVec, aVec);
4304+     __ vrole32_vi (dVec, 8 , tmp_vr);
4305+ 
4306+     //  c += d, b ^= c, b <<<= 7
4307+     __ vadd_vv (cVec, cVec, dVec);
4308+     __ vxor_vv (bVec, bVec, cVec);
4309+     __ vrole32_vi (bVec, 7 , tmp_vr);
4310+   }
4311+ 
4312+   /* *
4313+    * int com.sun.crypto.provider.ChaCha20Cipher.implChaCha20Block(int[] initState, byte[] result) 
4314+    * 
4315+    *  Input arguments: 
4316+    *  c_rarg0   - state, the starting state 
4317+    *  c_rarg1   - key_stream, the array that will hold the result of the ChaCha20 block function 
4318+    * 
4319+    *  Implementation Note: 
4320+    *   Parallelization is achieved by loading individual state elements into vectors for N blocks. 
4321+    *   N depends on single vector register length. 
4322+    */  
4323+   address generate_chacha20Block () {
4324+     Label L_Rounds;
4325+ 
4326+     __ align (CodeEntryAlignment);
4327+     StubCodeMark mark (this , " StubRoutines" " chacha20Block" 
4328+     address start = __ pc ();
4329+     __ enter ();
4330+ 
4331+     const  int  states_len = 16 ;
4332+     const  int  step = 4 ;
4333+     const  Register state = c_rarg0;
4334+     const  Register key_stream = c_rarg1;
4335+     const  Register tmp_addr = t0;
4336+     const  Register length = t1;
4337+ 
4338+     //  Organize vector registers in an array that facilitates
4339+     //  putting repetitive opcodes into loop structures below.
4340+     const  VectorRegister work_vrs[16 ] = {
4341+       v0, v1, v2,  v3,  v4,  v5,  v6,  v7,
4342+       v8, v9, v10, v11, v12, v13, v14, v15
4343+     };
4344+     const  VectorRegister tmp_vr = v16;
4345+     const  VectorRegister counter_vr = v17;
4346+ 
4347+     {
4348+       //  Put 16 here, as com.sun.crypto.providerChaCha20Cipher.KS_MAX_LEN is 1024
4349+       //  in java level.
4350+       __ vsetivli (length, 16 , Assembler::e32 , Assembler::m1);
4351+     }
4352+ 
4353+     //  Load from source state.
4354+     //  Every element in source state is duplicated to all elements in the corresponding vector.
4355+     __ mv (tmp_addr, state);
4356+     for  (int  i = 0 ; i < states_len; i += 1 ) {
4357+       __ vlse32_v (work_vrs[i], tmp_addr, zr);
4358+       __ addi (tmp_addr, tmp_addr, step);
4359+     }
4360+     //  Adjust counter for every individual block.
4361+     __ vid_v (counter_vr);
4362+     __ vadd_vv (work_vrs[12 ], work_vrs[12 ], counter_vr);
4363+ 
4364+     //  Perform 10 iterations of the 8 quarter round set
4365+     {
4366+       const  Register loop = t2; //  share t2 with other non-overlapping usages.
4367+       __ mv (loop, 10 );
4368+       __ BIND (L_Rounds);
4369+ 
4370+       chacha20_quarter_round (work_vrs[0 ], work_vrs[4 ], work_vrs[8 ],  work_vrs[12 ], tmp_vr);
4371+       chacha20_quarter_round (work_vrs[1 ], work_vrs[5 ], work_vrs[9 ],  work_vrs[13 ], tmp_vr);
4372+       chacha20_quarter_round (work_vrs[2 ], work_vrs[6 ], work_vrs[10 ], work_vrs[14 ], tmp_vr);
4373+       chacha20_quarter_round (work_vrs[3 ], work_vrs[7 ], work_vrs[11 ], work_vrs[15 ], tmp_vr);
4374+ 
4375+       chacha20_quarter_round (work_vrs[0 ], work_vrs[5 ], work_vrs[10 ], work_vrs[15 ], tmp_vr);
4376+       chacha20_quarter_round (work_vrs[1 ], work_vrs[6 ], work_vrs[11 ], work_vrs[12 ], tmp_vr);
4377+       chacha20_quarter_round (work_vrs[2 ], work_vrs[7 ], work_vrs[8 ],  work_vrs[13 ], tmp_vr);
4378+       chacha20_quarter_round (work_vrs[3 ], work_vrs[4 ], work_vrs[9 ],  work_vrs[14 ], tmp_vr);
4379+ 
4380+       __ sub (loop, loop, 1 );
4381+       __ bnez (loop, L_Rounds);
4382+     }
4383+ 
4384+     //  Add the original state into the end working state.
4385+     //  We do this by first duplicating every element in source state array to the corresponding
4386+     //  vector, then adding it to the post-loop working state.
4387+     __ mv (tmp_addr, state);
4388+     for  (int  i = 0 ; i < states_len; i += 1 ) {
4389+       __ vlse32_v (tmp_vr, tmp_addr, zr);
4390+       __ addi (tmp_addr, tmp_addr, step);
4391+       __ vadd_vv (work_vrs[i], work_vrs[i], tmp_vr);
4392+     }
4393+     //  Add the counter overlay onto work_vrs[12] at the end.
4394+     __ vadd_vv (work_vrs[12 ], work_vrs[12 ], counter_vr);
4395+ 
4396+     //  Store result to key stream.
4397+     {
4398+       const  Register stride = t2; //  share t2 with other non-overlapping usages.
4399+       //  Every block occupies 64 bytes, so we use 64 as stride of the vector store.
4400+       __ mv (stride, 64 );
4401+       for  (int  i = 0 ; i < states_len; i += 1 ) {
4402+         __ vsse32_v (work_vrs[i], key_stream, stride);
4403+         __ addi (key_stream, key_stream, step);
4404+       }
4405+     }
4406+ 
4407+     //  Return length of output key_stream
4408+     __ slli (c_rarg0, length, 6 );
4409+ 
4410+     __ leave ();
4411+     __ ret ();
4412+ 
4413+     return  (address) start;
4414+   }
4415+ 
42804416#if  INCLUDE_JFR
42814417
42824418  static  void  jfr_prologue (address the_pc, MacroAssembler* _masm, Register thread) {
@@ -4496,6 +4632,11 @@ class StubGenerator: public StubCodeGenerator {
44964632      StubRoutines::_md5_implCompress   = generate_md5_implCompress (false , " md5_implCompress" 
44974633      StubRoutines::_md5_implCompressMB = generate_md5_implCompress (true ,  " md5_implCompressMB" 
44984634    }
4635+ 
4636+     if  (UseChaCha20Intrinsics) {
4637+       StubRoutines::_chacha20Block = generate_chacha20Block ();
4638+     }
4639+ 
44994640#endif  //  COMPILER2_OR_JVMCI
45004641  }
45014642
0 commit comments