From 08f71cee951cfdda6b056165e0491b686a2b05bf Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Sun, 13 Oct 2024 15:45:43 +0900 Subject: [PATCH] vpdpw[su,us,uu]d[,s] support avx10.2 --- doc/usage.md | 2 +- gen/gen_avx512.cpp | 14 +++++--- gen/gen_code.cpp | 14 ++++---- test/avx10/misc.txt | 74 ++++++++++++++++++++++++++++++++++++++++++ xbyak/xbyak_mnemonic.h | 12 +++---- 5 files changed, 99 insertions(+), 17 deletions(-) diff --git a/doc/usage.md b/doc/usage.md index 9398755c..5b255130 100644 --- a/doc/usage.md +++ b/doc/usage.md @@ -136,7 +136,7 @@ param|vnniEnc|avx10Enc EvexEncoding|AVX512-VNNI|AVX10.2 VexEncoding|AVX-VNNI|AVX-VNNI-INT8 default|EvexEncoding|VexEncoding -mnemonic|vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds|vmpsadbw, vpdpbssd, vpdpbssds, vpdpbsud, vpdpbsuds, vpdpbuud, vpdpbuuds +mnemonic|vpdpbusd, vpdpbusds, vpdpwssd, vpdpwssds|vmpsadbw, vpdpbssd, vpdpbssds, vpdpbsud, vpdpbsuds, vpdpbuud, vpdpbuuds, vpdpwsud, vpdpwsuds, vpdpwusd, vpdpwusds, vpdpwuud, vpdpwuuds ### Remark * `k1`, ..., `k7` are opmask registers. 
diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp index ed7440c2..2b8a3286 100644 --- a/gen/gen_avx512.cpp +++ b/gen/gen_avx512.cpp @@ -467,16 +467,22 @@ void putX_X_XM_IMM_AVX10() int sel; bool hasIMM; } tbl[] = { + // vpdpb[su,uu,ss]d[,s] { 0x50, "vpdpbssd", T_F2|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, { 0x51, "vpdpbssds", T_F2|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, { 0x50, "vpdpbsud", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, { 0x51, "vpdpbsuds", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, { 0x50, "vpdpbuud", T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, { 0x51, "vpdpbuuds", T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, -#if 0 - { 0x50, "vpdpbuud", T_MUST_EVEX | T_YMM | T_0F38 | T_EW0 | T_B32, false }, - { 0x51, "vpdpbuuds", T_MUST_EVEX | T_YMM | T_0F38 | T_EW0 | T_B32, false }, -#endif + + // vpdpw[su,us,uu]d[,s] + { 0xD2, "vpdpwsud", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0xD3, "vpdpwsuds", T_F3|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0xD2, "vpdpwusd", T_66|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0xD3, "vpdpwusds", T_66|T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0xD2, "vpdpwuud", T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0xD3, "vpdpwuuds", T_0F38|T_YMM, T_W0, T_EW0|T_B32, 1, false }, + { 0x42, "vmpsadbw", T_0F3A|T_YMM, T_66|T_W0|T_YMM, T_F3|T_0F3A|T_EW0|T_B32, 1, true }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp index a71d416c..a22c12b2 100644 --- a/gen/gen_code.cpp +++ b/gen/gen_code.cpp @@ -1901,6 +1901,7 @@ void put() } // avx-vnni-int8 // avx-vnni-int16 +#if 0 { const struct Tbl { uint8_t code; @@ -1914,12 +1915,12 @@ void put() // { 0x50, "vpdpbuud", T_0F38 | T_W0 | T_YMM }, // { 0x51, "vpdpbuuds", T_0F38 | T_W0 | T_YMM }, - { 0xD2, "vpdpwsud", T_F3 | T_0F38 | T_W0 | T_YMM }, - { 0xD3, "vpdpwsuds", T_F3 | T_0F38 | T_W0 | T_YMM }, - { 0xD2, "vpdpwusd", T_66 | T_0F38 | T_W0 | T_YMM }, - { 0xD3, "vpdpwusds", T_66 | T_0F38 | T_W0 
| T_YMM }, - { 0xD2, "vpdpwuud", T_0F38 | T_W0 | T_YMM }, - { 0xD3, "vpdpwuuds", T_0F38 | T_W0 | T_YMM }, +// { 0xD2, "vpdpwsud", T_F3 | T_0F38 | T_W0 | T_YMM }, +// { 0xD3, "vpdpwsuds", T_F3 | T_0F38 | T_W0 | T_YMM }, +// { 0xD2, "vpdpwusd", T_66 | T_0F38 | T_W0 | T_YMM }, +// { 0xD3, "vpdpwusds", T_66 | T_0F38 | T_W0 | T_YMM }, +// { 0xD2, "vpdpwuud", T_0F38 | T_W0 | T_YMM }, +// { 0xD3, "vpdpwuuds", T_0F38 | T_W0 | T_YMM }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; @@ -1927,6 +1928,7 @@ void put() printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, %s, 0x%02X); }\n", p->name, s.c_str(), p->code); } } +#endif } void put32() diff --git a/test/avx10/misc.txt b/test/avx10/misc.txt index 380e9a94..9464d034 100644 --- a/test/avx10/misc.txt +++ b/test/avx10/misc.txt @@ -91,3 +91,77 @@ vpdpbuuds(ym1, ym2, ptr_b[rax+128]); vpdpbuuds(zm1, zm2, zm3); vpdpbuuds(zm1, zm2, ptr[rax+128]); vpdpbuuds(zm1, zm2, ptr_b[rax+128]); + +// +vpdpwsud(xm1, xm2, xm3); +vpdpwsud(xm1, xm2, ptr[rax+128]); +vpdpwsud(xm1, xm2, ptr_b[rax+128]); + +vpdpwsud(ym1, ym2, ym3); +vpdpwsud(ym1, ym2, ptr[rax+128]); +vpdpwsud(ym1, ym2, ptr_b[rax+128]); + +vpdpwsud(zm1, zm2, zm3); +vpdpwsud(zm1, zm2, ptr[rax+128]); +vpdpwsud(zm1, zm2, ptr_b[rax+128]); +// +vpdpwsuds(xm1, xm2, xm3); +vpdpwsuds(xm1, xm2, ptr[rax+128]); +vpdpwsuds(xm1, xm2, ptr_b[rax+128]); + +vpdpwsuds(ym1, ym2, ym3); +vpdpwsuds(ym1, ym2, ptr[rax+128]); +vpdpwsuds(ym1, ym2, ptr_b[rax+128]); + +vpdpwsuds(zm1, zm2, zm3); +vpdpwsuds(zm1, zm2, ptr[rax+128]); +vpdpwsuds(zm1, zm2, ptr_b[rax+128]); +// +vpdpwusd(xm1, xm2, xm3); +vpdpwusd(xm1, xm2, ptr[rax+128]); +vpdpwusd(xm1, xm2, ptr_b[rax+128]); + +vpdpwusd(ym1, ym2, ym3); +vpdpwusd(ym1, ym2, ptr[rax+128]); +vpdpwusd(ym1, ym2, ptr_b[rax+128]); + +vpdpwusd(zm1, zm2, zm3); +vpdpwusd(zm1, zm2, ptr[rax+128]); +vpdpwusd(zm1, zm2, ptr_b[rax+128]); +// +vpdpwusds(xm1, xm2, xm3); +vpdpwusds(xm1, xm2, ptr[rax+128]); 
+vpdpwusds(xm1, xm2, ptr_b[rax+128]); + +vpdpwusds(ym1, ym2, ym3); +vpdpwusds(ym1, ym2, ptr[rax+128]); +vpdpwusds(ym1, ym2, ptr_b[rax+128]); + +vpdpwusds(zm1, zm2, zm3); +vpdpwusds(zm1, zm2, ptr[rax+128]); +vpdpwusds(zm1, zm2, ptr_b[rax+128]); + +// +vpdpwuud(xm1, xm2, xm3); +vpdpwuud(xm1, xm2, ptr[rax+128]); +vpdpwuud(xm1, xm2, ptr_b[rax+128]); + +vpdpwuud(ym1, ym2, ym3); +vpdpwuud(ym1, ym2, ptr[rax+128]); +vpdpwuud(ym1, ym2, ptr_b[rax+128]); + +vpdpwuud(zm1, zm2, zm3); +vpdpwuud(zm1, zm2, ptr[rax+128]); +vpdpwuud(zm1, zm2, ptr_b[rax+128]); +// +vpdpwuuds(xm1, xm2, xm3); +vpdpwuuds(xm1, xm2, ptr[rax+128]); +vpdpwuuds(xm1, xm2, ptr_b[rax+128]); + +vpdpwuuds(ym1, ym2, ym3); +vpdpwuuds(ym1, ym2, ptr[rax+128]); +vpdpwuuds(ym1, ym2, ptr_b[rax+128]); + +vpdpwuuds(zm1, zm2, zm3); +vpdpwuuds(zm1, zm2, ptr[rax+128]); +vpdpwuuds(zm1, zm2, ptr_b[rax+128]); diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index c3c6c8be..dc2e1adf 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -1423,12 +1423,6 @@ void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x51, encoding); } void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x52, encoding); } void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x53, encoding); } -void vpdpwsud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_W0|T_YMM, 0xD2); } -void vpdpwsuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_W0|T_YMM, 0xD3); } -void vpdpwusd(const Xmm& x1, const Xmm& x2, const 
Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_YMM, 0xD2); } -void vpdpwusds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_YMM, 0xD3); } -void vpdpwuud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38|T_W0|T_YMM, 0xD2); } -void vpdpwuuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38|T_W0|T_YMM, 0xD3); } void vperm2f128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x06, imm); } void vperm2i128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x46, imm); } void vpermd(const Ymm& y1, const Ymm& y2, const Operand& op) { opAVX_X_X_XM(y1, y2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0x36); } @@ -2451,6 +2445,12 @@ void vpdpbsud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding void vpdpbsuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F3|T_0F38|T_YMM, 0x51, encoding, NONE, T_W0, T_EW0|T_B32, 1); } void vpdpbuud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0x50, encoding, NONE, T_W0, T_EW0|T_B32, 1); } void vpdpbuuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0x51, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpwsud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F3|T_0F38|T_YMM, 0xD2, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpwsuds(const Xmm& x1, const Xmm& x2, const Operand& op, 
PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F3|T_0F38|T_YMM, 0xD3, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpwusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_YMM, 0xD2, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpwusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_YMM, 0xD3, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpwuud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0xD2, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpwuuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0xD3, encoding, NONE, T_W0, T_EW0|T_B32, 1); } void vpermb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x8D); } void vpermi2b(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x75); } void vpermi2d(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0x76); }