Skip to content

Commit

Permalink
Zero register optimization for AVX-512-VBMI
Browse files Browse the repository at this point in the history
- Take advantage of the fact that AVX instructions zero the upper 128 bits for a nice optimization when one input vector is zeroed
  • Loading branch information
Whatcookie committed Jul 21, 2023
1 parent 38a5313 commit 81aa8ac
Showing 1 changed file with 53 additions and 0 deletions.
53 changes: 53 additions & 0 deletions rpcs3/Emu/Cell/SPURecompiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8619,6 +8619,21 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator
{
if (data == v128::from8p(data._u8[0]))
{
if (m_use_avx512_icl)
{
if (perm_only)
{
set_vr(op.rt4, vperm2b256to128(as, b, c));
return;
}

const auto m = gf2p8affineqb(c, build<u8[16]>(0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20), 0x7f);
const auto mm = select(noncast<s8[16]>(m) >= 0, splat<u8[16]>(0), m);
const auto ab = vperm2b256to128(as, b, c);
set_vr(op.rt4, select(noncast<s8[16]>(c) >= 0, ab, mm));
return;

}
// See above
const auto x = pshufb(build<u8[16]>(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x80, 0x80), (c >> 4));
const auto ax = pshufb(as, c);
Expand Down Expand Up @@ -8653,6 +8668,44 @@ class spu_llvm_recompiler : public spu_recompiler_base, public cpu_translator

if (m_use_avx512_icl && (op.ra != op.rb || m_interp_magn))
{
if (auto [ok, data] = get_const_vector(b.value, m_pos); ok)
{
if (data == v128::from8p(data._u8[0]))
{
if (perm_only)
{
set_vr(op.rt4, vperm2b256to128(a, b, eval(c ^ 0xf)));
return;
}

const auto m = gf2p8affineqb(c, build<u8[16]>(0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20), 0x7f);
const auto mm = select(noncast<s8[16]>(m) >= 0, splat<u8[16]>(0), m);
const auto ab = vperm2b256to128(a, b, eval(c ^ 0xf));
set_vr(op.rt4, select(noncast<s8[16]>(c) >= 0, ab, mm));
return;

}
}

if (auto [ok, data] = get_const_vector(a.value, m_pos); ok)
{
if (data == v128::from8p(data._u8[0]))
{
if (perm_only)
{
set_vr(op.rt4, vperm2b256to128(b, a, eval(c ^ 0x1f)));
return;
}

const auto m = gf2p8affineqb(c, build<u8[16]>(0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x40, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20), 0x7f);
const auto mm = select(noncast<s8[16]>(m) >= 0, splat<u8[16]>(0), m);
const auto ab = vperm2b256to128(b, a, eval(c ^ 0x1f));
set_vr(op.rt4, select(noncast<s8[16]>(c) >= 0, ab, mm));
return;

}
}

if (perm_only)
{
set_vr(op.rt4, vperm2b(a, b, eval(c ^ 0xf)));
Expand Down

0 comments on commit 81aa8ac

Please sign in to comment.