diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64_kyber.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64_kyber.cpp
index 3e5593322d5ea..cec8431e44adc 100644
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64_kyber.cpp
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64_kyber.cpp
@@ -64,6 +64,39 @@ static address kyberAvx512ConstsAddr(int offset) {
 
 const Register scratch = r10;
 
+ATTRIBUTE_ALIGNED(64) static const uint8_t kyberAvx512_12To16Swap[] = {
+// 0 - 63
+    0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16,
+    16, 17, 18, 19, 19, 20, 21, 22, 22, 23, 24, 25, 25, 26, 27, 28, 28, 29, 30,
+    31, 31, 32, 33, 34, 34, 35, 36, 37, 37, 38, 39, 40, 40, 41, 42, 43, 43, 44,
+    45, 46, 46, 47
+};
+
+static address kyberAvx512_12To16SwapAddr() {
+  return (address) kyberAvx512_12To16Swap;
+}
+
+ATTRIBUTE_ALIGNED(64) static const uint16_t kyberAvx512_12To16Shift[] = {
+// 0 - 31
+    0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0,
+    4, 0, 4, 0, 4, 0, 4
+};
+
+static address kyberAvx512_12To16ShiftAddr() {
+  return (address) kyberAvx512_12To16Shift;
+}
+
+ATTRIBUTE_ALIGNED(64) static const uint64_t kyberAvx512_12To16And[] = {
+// 0 - 7
+    0x0FFF0FFF0FFF0FFF, 0x0FFF0FFF0FFF0FFF, 0x0FFF0FFF0FFF0FFF,
+    0x0FFF0FFF0FFF0FFF, 0x0FFF0FFF0FFF0FFF, 0x0FFF0FFF0FFF0FFF,
+    0x0FFF0FFF0FFF0FFF, 0x0FFF0FFF0FFF0FFF
+};
+
+static address kyberAvx512_12To16AndAddr() {
+  return (address) kyberAvx512_12To16And;
+}
+
 ATTRIBUTE_ALIGNED(64) static const uint16_t kyberAvx512NttPerms[] = {
 // 0
     0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
@@ -822,10 +855,64 @@ address generate_kyber12To16_avx512(StubGenerator *stubgen,
 
   const Register perms = r11;
 
-  Label Loop;
+  Label Loop, VBMILoop;
 
   __ addptr(condensed, condensedOffs);
 
+  if (VM_Version::supports_avx512_vbmi2()) {
+    // mask load for the first 48 bytes of each vector
+    __ mov64(rax, 0x0000FFFFFFFFFFFF);
+    __ kmovql(k1, rax);
+
+    __ lea(perms, ExternalAddress(kyberAvx512_12To16SwapAddr()));
+    __ evmovdqub(xmm20, Address(perms), Assembler::AVX_512bit);
+
+    __ lea(perms, ExternalAddress(kyberAvx512_12To16ShiftAddr()));
+    __ evmovdquw(xmm21, Address(perms), Assembler::AVX_512bit);
+
+    __ lea(perms, ExternalAddress(kyberAvx512_12To16AndAddr()));
+    __ evmovdquq(xmm22, Address(perms), Assembler::AVX_512bit);
+
+    __ BIND(VBMILoop);
+
+    __ evmovdqub(xmm0, k1, Address(condensed, 0), false,
+                 Assembler::AVX_512bit);
+    __ evmovdqub(xmm1, k1, Address(condensed, 48), false,
+                 Assembler::AVX_512bit);
+    __ evmovdqub(xmm2, k1, Address(condensed, 96), false,
+                 Assembler::AVX_512bit);
+    __ evmovdqub(xmm3, k1, Address(condensed, 144), false,
+                 Assembler::AVX_512bit);
+
+    __ evpermb(xmm4, k0, xmm20, xmm0, false, Assembler::AVX_512bit);
+    __ evpermb(xmm5, k0, xmm20, xmm1, false, Assembler::AVX_512bit);
+    __ evpermb(xmm6, k0, xmm20, xmm2, false, Assembler::AVX_512bit);
+    __ evpermb(xmm7, k0, xmm20, xmm3, false, Assembler::AVX_512bit);
+
+    __ evpsrlvw(xmm4, xmm4, xmm21, Assembler::AVX_512bit);
+    __ evpsrlvw(xmm5, xmm5, xmm21, Assembler::AVX_512bit);
+    __ evpsrlvw(xmm6, xmm6, xmm21, Assembler::AVX_512bit);
+    __ evpsrlvw(xmm7, xmm7, xmm21, Assembler::AVX_512bit);
+
+    __ evpandq(xmm0, xmm22, xmm4, Assembler::AVX_512bit);
+    __ evpandq(xmm1, xmm22, xmm5, Assembler::AVX_512bit);
+    __ evpandq(xmm2, xmm22, xmm6, Assembler::AVX_512bit);
+    __ evpandq(xmm3, xmm22, xmm7, Assembler::AVX_512bit);
+
+    store4regs(parsed, 0, xmm0_3, _masm);
+
+    __ addptr(condensed, 192);
+    __ addptr(parsed, 256);
+    __ subl(parsedLength, 128);
+    __ jcc(Assembler::greater, VBMILoop);
+
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ mov64(rax, 0); // return 0
+    __ ret(0);
+
+    return start;
+  }
+
   __ lea(perms, ExternalAddress(kyberAvx512_12To16PermsAddr()));
 
   load4regs(xmm24_27, perms, 0, _masm);
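
For readers following the new VBMI2 fast path: each 512-bit masked load pulls in 48 condensed bytes (hence the 0x0000FFFFFFFFFFFF mask in k1), evpermb places the two source bytes of every 12-bit coefficient into their own 16-bit lane via kyberAvx512_12To16Swap, evpsrlvw shifts the odd lanes right by 4 via kyberAvx512_12To16Shift, and evpandq keeps the low 12 bits of every lane via kyberAvx512_12To16And. A scalar C++ sketch of the same transformation follows; it is not part of the patch, and the helper name and signature are illustrative only.

#include <cstdint>

// Scalar model of the 12-to-16-bit unpacking that the VBMI2 loop
// vectorizes: every 3 condensed bytes hold two 12-bit coefficients.
static void kyber12To16Scalar(const uint8_t* condensed, uint16_t* parsed,
                              int parsedLength) {
  for (int i = 0; i < parsedLength; i += 2) {
    const uint8_t* b = condensed + 3 * (i / 2);
    // even lane: bytes {b0, b1}, shift by 0, keep the low 12 bits
    parsed[i]     = (uint16_t)((b[0] | (b[1] << 8)) & 0x0FFF);
    // odd lane: bytes {b1, b2}, shift by 4, keep the low 12 bits
    parsed[i + 1] = (uint16_t)(((b[1] | (b[2] << 8)) >> 4) & 0x0FFF);
  }
}

Each 48-byte load expands to 32 coefficients, so one iteration over four vectors consumes 4 x 48 = 192 condensed bytes and produces 128 coefficients (256 bytes), which is exactly what the addptr(condensed, 192), addptr(parsed, 256), and subl(parsedLength, 128) updates at the bottom of VBMILoop reflect.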