Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 88 additions & 1 deletion src/hotspot/cpu/x86/stubGenerator_x86_64_kyber.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,39 @@ static address kyberAvx512ConstsAddr(int offset) {

const Register scratch = r10;

// Byte-permutation indices for the VBMI path of generate_kyber12To16_avx512,
// where the table is loaded into xmm20 and passed to evpermb.
// The entries come in groups of four, {3k, 3k+1, 3k+1, 3k+2} for k = 0..15:
// 16-bit word 2k of the permuted vector is assembled from source bytes
// (3k, 3k+1) and word 2k+1 from source bytes (3k+1, 3k+2).
// Only source bytes 0..47 are ever referenced.
ATTRIBUTE_ALIGNED(64) static const uint8_t kyberAvx512_12To16Swap[] = {
  // destination bytes 0 - 15
   0,  1,  1,  2,    3,  4,  4,  5,    6,  7,  7,  8,    9, 10, 10, 11,
  // destination bytes 16 - 31
  12, 13, 13, 14,   15, 16, 16, 17,   18, 19, 19, 20,   21, 22, 22, 23,
  // destination bytes 32 - 47
  24, 25, 25, 26,   27, 28, 28, 29,   30, 31, 31, 32,   33, 34, 34, 35,
  // destination bytes 48 - 63
  36, 37, 37, 38,   39, 40, 40, 41,   42, 43, 43, 44,   45, 46, 46, 47
};

// Returns the address of the first byte of the kyberAvx512_12To16Swap table,
// in the form the stub generator wraps in an ExternalAddress.
static address kyberAvx512_12To16SwapAddr() {
  return (address) &kyberAvx512_12To16Swap[0];
}

// Per-word right-shift counts for the VBMI path of
// generate_kyber12To16_avx512, where the table is loaded into xmm21 and
// passed to evpsrlvw as the shift-count operand.
// Even-numbered 16-bit words are left unshifted and odd-numbered words are
// shifted right by 4 bits.
ATTRIBUTE_ALIGNED(64) static const uint16_t kyberAvx512_12To16Shift[] = {
  // words 0 - 7
  0, 4, 0, 4, 0, 4, 0, 4,
  // words 8 - 15
  0, 4, 0, 4, 0, 4, 0, 4,
  // words 16 - 23
  0, 4, 0, 4, 0, 4, 0, 4,
  // words 24 - 31
  0, 4, 0, 4, 0, 4, 0, 4
};

// Returns the address of the first element of the kyberAvx512_12To16Shift
// table, in the form the stub generator wraps in an ExternalAddress.
static address kyberAvx512_12To16ShiftAddr() {
  return (address) &kyberAvx512_12To16Shift[0];
}

// AND mask for the VBMI path of generate_kyber12To16_avx512, where the table
// is loaded into xmm22 and applied with evpandq.
// Every 16-bit lane of the 512-bit constant is 0x0FFF, so the AND keeps the
// low 12 bits of each word and clears the upper 4.
ATTRIBUTE_ALIGNED(64) static const uint64_t kyberAvx512_12To16And[] = {
  0x0FFF0FFF0FFF0FFF,  // qword 0
  0x0FFF0FFF0FFF0FFF,  // qword 1
  0x0FFF0FFF0FFF0FFF,  // qword 2
  0x0FFF0FFF0FFF0FFF,  // qword 3
  0x0FFF0FFF0FFF0FFF,  // qword 4
  0x0FFF0FFF0FFF0FFF,  // qword 5
  0x0FFF0FFF0FFF0FFF,  // qword 6
  0x0FFF0FFF0FFF0FFF   // qword 7
};

// Returns the address of the first element of the kyberAvx512_12To16And
// table, in the form the stub generator wraps in an ExternalAddress.
static address kyberAvx512_12To16AndAddr() {
  return (address) &kyberAvx512_12To16And[0];
}

ATTRIBUTE_ALIGNED(64) static const uint16_t kyberAvx512NttPerms[] = {
// 0
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
Expand Down Expand Up @@ -822,10 +855,64 @@ address generate_kyber12To16_avx512(StubGenerator *stubgen,

const Register perms = r11;

Label Loop;
Label Loop, VBMILoop;

__ addptr(condensed, condensedOffs);

if (VM_Version::supports_avx512_vbmi2()) {
// mask load for the first 48 bytes of each vector
__ mov64(rax, 0x0000FFFFFFFFFFFF);
__ kmovql(k1, rax);

__ lea(perms, ExternalAddress(kyberAvx512_12To16SwapAddr()));
__ evmovdqub(xmm20, Address(perms), Assembler::AVX_512bit);

__ lea(perms, ExternalAddress(kyberAvx512_12To16ShiftAddr()));
__ evmovdquw(xmm21, Address(perms), Assembler::AVX_512bit);

__ lea(perms, ExternalAddress(kyberAvx512_12To16AndAddr()));
__ evmovdquq(xmm22, Address(perms), Assembler::AVX_512bit);

__ BIND(VBMILoop);

__ evmovdqub(xmm0, k1, Address(condensed, 0), false,
Assembler::AVX_512bit);
__ evmovdqub(xmm1, k1, Address(condensed, 48), false,
Assembler::AVX_512bit);
__ evmovdqub(xmm2, k1, Address(condensed, 96), false,
Assembler::AVX_512bit);
__ evmovdqub(xmm3, k1, Address(condensed, 144), false,
Assembler::AVX_512bit);

__ evpermb(xmm4, k0, xmm20, xmm0, false, Assembler::AVX_512bit);
__ evpermb(xmm5, k0, xmm20, xmm1, false, Assembler::AVX_512bit);
__ evpermb(xmm6, k0, xmm20, xmm2, false, Assembler::AVX_512bit);
__ evpermb(xmm7, k0, xmm20, xmm3, false, Assembler::AVX_512bit);

__ evpsrlvw(xmm4, xmm4, xmm21, Assembler::AVX_512bit);
__ evpsrlvw(xmm5, xmm5, xmm21, Assembler::AVX_512bit);
__ evpsrlvw(xmm6, xmm6, xmm21, Assembler::AVX_512bit);
__ evpsrlvw(xmm7, xmm7, xmm21, Assembler::AVX_512bit);

__ evpandq(xmm0, xmm22, xmm4, Assembler::AVX_512bit);
__ evpandq(xmm1, xmm22, xmm5, Assembler::AVX_512bit);
__ evpandq(xmm2, xmm22, xmm6, Assembler::AVX_512bit);
__ evpandq(xmm3, xmm22, xmm7, Assembler::AVX_512bit);

store4regs(parsed, 0, xmm0_3, _masm);

__ addptr(condensed, 192);
__ addptr(parsed, 256);
__ subl(parsedLength, 128);
__ jcc(Assembler::greater, VBMILoop);

__ leave(); // required for proper stackwalking of RuntimeStub frame
__ mov64(rax, 0); // return 0
__ ret(0);

return start;
}

__ lea(perms, ExternalAddress(kyberAvx512_12To16PermsAddr()));

load4regs(xmm24_27, perms, 0, _masm);
Expand Down