-
Notifications
You must be signed in to change notification settings - Fork 4.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[API Proposal]: AVX-512 VPOPCNTDQ and BITALG Intrinsics
#96162
Comments
Tagging subscribers to this area: @dotnet/area-system-runtime-intrinsics
Issue Details
Background and motivation
Both
API Proposal
namespace System.Runtime.Intrinsics.X86;
// Proposed API surface for the AVX-512 VPOPCNTDQ extension, derived from Avx512DQ.
// Per the proposal text, VPOPCNTDQ provides a parallel popcnt over 32-bit and 64-bit elements.
// This is a signature-only API listing: members have no bodies.
[Intrinsic]
[CLSCompliant(false)]
public abstract class Avx512Vpopcntdq : Avx512DQ
{
// Declared with `new`, so it hides the IsSupported inherited from Avx512DQ.
public static new bool IsSupported { get; }
// Nested X64 class deriving from Avx512DQ.X64; the listing gives it no members beyond IsSupported.
[Intrinsic]
public new abstract class X64 : Avx512DQ.X64
{
// Internal constructor: the type cannot be instantiated or derived from outside its assembly.
internal X64() { }
public static new bool IsSupported { get; }
}
// Vector512 PopCount overloads for int, uint, long and ulong; each returns the same vector type it takes.
public static Vector512<int> PopCount(Vector512<int> value);
public static Vector512<uint> PopCount(Vector512<uint> value);
public static Vector512<long> PopCount(Vector512<long> value);
public static Vector512<ulong> PopCount(Vector512<ulong> value);
// Nested VL class deriving from Avx512DQ.VL; it carries the Vector256 and Vector128 overloads.
// NOTE(review): unlike X64, this nested class is listed without `new`, [Intrinsic] or an internal constructor — confirm whether that is intentional.
public abstract class VL : Avx512DQ.VL
{
public static new bool IsSupported { get; }
// Vector256 overloads.
public static Vector256<int> PopCount(Vector256<int> value);
public static Vector256<uint> PopCount(Vector256<uint> value);
public static Vector256<long> PopCount(Vector256<long> value);
public static Vector256<ulong> PopCount(Vector256<ulong> value);
// Vector128 overloads.
public static Vector128<int> PopCount(Vector128<int> value);
public static Vector128<uint> PopCount(Vector128<uint> value);
public static Vector128<long> PopCount(Vector128<long> value);
public static Vector128<ulong> PopCount(Vector128<ulong> value);
}
}
// Proposed API surface for the AVX-512 BITALG extension, derived from Avx512BW.
// Per the proposal text, BITALG extends the parallel popcnt to 8-bit and 16-bit elements and adds
// the VPSHUFBITQMB instruction, which the proposal describes as a bit gather select.
// This is a signature-only API listing: members have no bodies.
[Intrinsic]
[CLSCompliant(false)]
public abstract class Avx512BitAlg : Avx512BW
{
// Declared with `new`, so it hides the IsSupported inherited from Avx512BW.
public static new bool IsSupported { get; }
// Nested X64 class deriving from Avx512BW.X64; the listing gives it no members beyond IsSupported.
[Intrinsic]
public new abstract class X64 : Avx512BW.X64
{
// Internal constructor: the type cannot be instantiated or derived from outside its assembly.
internal X64() { }
public static new bool IsSupported { get; }
}
// Vector512 PopCount overloads for short, ushort, byte and sbyte; each returns the same vector type it takes.
public static Vector512<short> PopCount(Vector512<short> value);
public static Vector512<ushort> PopCount(Vector512<ushort> value);
public static Vector512<byte> PopCount(Vector512<byte> value);
public static Vector512<sbyte> PopCount(Vector512<sbyte> value);
// BitShuffle takes a 64-bit-element `value` and an 8-bit-element `control`, and returns an 8-bit-element vector.
// NOTE(review): presumably this is the VPSHUFBITQMB mapping, with the mask result expressed as a vector — confirm.
public static Vector512<byte> BitShuffle(Vector512<ulong> value, Vector512<byte> control);
public static Vector512<sbyte> BitShuffle(Vector512<long> value, Vector512<sbyte> control);
// MaskBitShuffle adds a leading `mask` operand of the same type as the result.
// NOTE(review): presumably the masked form of the same instruction; the issue's "Alternative Designs"
// notes that the mask and return types could be ulong instead — confirm the intended shape.
public static Vector512<byte> MaskBitShuffle(Vector512<byte> mask, Vector512<ulong> value, Vector512<byte> control);
public static Vector512<sbyte> MaskBitShuffle(Vector512<sbyte> mask, Vector512<long> value, Vector512<sbyte> control);
// Nested VL class deriving from Avx512BW.VL; it carries the Vector256 and Vector128 overloads.
// NOTE(review): unlike X64, this nested class is listed without `new`, [Intrinsic] or an internal constructor — confirm whether that is intentional.
public abstract class VL : Avx512BW.VL
{
public static new bool IsSupported { get; }
// Vector256 PopCount overloads.
public static Vector256<short> PopCount(Vector256<short> value);
public static Vector256<ushort> PopCount(Vector256<ushort> value);
public static Vector256<byte> PopCount(Vector256<byte> value);
public static Vector256<sbyte> PopCount(Vector256<sbyte> value);
// Vector128 PopCount overloads.
public static Vector128<short> PopCount(Vector128<short> value);
public static Vector128<ushort> PopCount(Vector128<ushort> value);
public static Vector128<byte> PopCount(Vector128<byte> value);
public static Vector128<sbyte> PopCount(Vector128<sbyte> value);
// Vector256 BitShuffle / MaskBitShuffle overloads.
public static Vector256<byte> BitShuffle(Vector256<ulong> value, Vector256<byte> control);
public static Vector256<sbyte> BitShuffle(Vector256<long> value, Vector256<sbyte> control);
public static Vector256<byte> MaskBitShuffle(Vector256<byte> mask, Vector256<ulong> value, Vector256<byte> control);
public static Vector256<sbyte> MaskBitShuffle(Vector256<sbyte> mask, Vector256<long> value, Vector256<sbyte> control);
// Vector128 BitShuffle / MaskBitShuffle overloads.
public static Vector128<byte> BitShuffle(Vector128<ulong> value, Vector128<byte> control);
public static Vector128<sbyte> BitShuffle(Vector128<long> value, Vector128<sbyte> control);
public static Vector128<byte> MaskBitShuffle(Vector128<byte> mask, Vector128<ulong> value, Vector128<byte> control);
public static Vector128<sbyte> MaskBitShuffle(Vector128<sbyte> mask, Vector128<long> value, Vector128<sbyte> control);
}
} API Usagevar blocksPerRows = Avx512BitAlg.PopCount(board); Alternative Designs
RisksNone
|
Returning a vector is consistent with how the other APIs that "return masks" work and should mesh with the existing pattern recognition allowing it to be consumed directly as a mask. |
namespace System.Runtime.Intrinsics.X86;
// Revised API listing for the AVX-512 VPOPCNTDQ extension, posted in a later comment on the issue.
// The class is spelled Avx512VPopcntDQ here (the original proposal used Avx512Vpopcntdq).
// This is a signature-only API listing: members have no bodies.
[Intrinsic]
[CLSCompliant(false)]
public abstract class Avx512VPopcntDQ : Avx512DQ
{
// Declared with `new`, so it hides the IsSupported inherited from Avx512DQ.
public static new bool IsSupported { get; }
// Nested X64 class deriving from Avx512DQ.X64; the listing gives it no members beyond IsSupported.
[Intrinsic]
public new abstract class X64 : Avx512DQ.X64
{
public static new bool IsSupported { get; }
}
// Vector512 PopCount overloads for int, uint, long and ulong; each returns the same vector type it takes.
public static Vector512<int> PopCount(Vector512<int> value);
public static Vector512<uint> PopCount(Vector512<uint> value);
public static Vector512<long> PopCount(Vector512<long> value);
public static Vector512<ulong> PopCount(Vector512<ulong> value);
// Nested VL class deriving from Avx512DQ.VL; it carries the Vector256 and Vector128 overloads.
// NOTE(review): listed without `new` or [Intrinsic], unlike X64 — confirm whether that is intentional.
public abstract class VL : Avx512DQ.VL
{
public static new bool IsSupported { get; }
// Vector256 overloads.
public static Vector256<int> PopCount(Vector256<int> value);
public static Vector256<uint> PopCount(Vector256<uint> value);
public static Vector256<long> PopCount(Vector256<long> value);
public static Vector256<ulong> PopCount(Vector256<ulong> value);
// Vector128 overloads.
public static Vector128<int> PopCount(Vector128<int> value);
public static Vector128<uint> PopCount(Vector128<uint> value);
public static Vector128<long> PopCount(Vector128<long> value);
public static Vector128<ulong> PopCount(Vector128<ulong> value);
}
}
// Revised API listing for the AVX-512 BITALG extension, posted in a later comment on the issue.
// The shuffle members are named ShuffleBits / MaskShuffleBits here (the original proposal used
// BitShuffle / MaskBitShuffle). This is a signature-only API listing: members have no bodies.
[Intrinsic]
[CLSCompliant(false)]
public abstract class Avx512BitAlg : Avx512BW
{
// Declared with `new`, so it hides the IsSupported inherited from Avx512BW.
public static new bool IsSupported { get; }
// Nested X64 class deriving from Avx512BW.X64; the listing gives it no members beyond IsSupported.
[Intrinsic]
public new abstract class X64 : Avx512BW.X64
{
public static new bool IsSupported { get; }
}
// Vector512 PopCount overloads for short, ushort, byte and sbyte; each returns the same vector type it takes.
public static Vector512<short> PopCount(Vector512<short> value);
public static Vector512<ushort> PopCount(Vector512<ushort> value);
public static Vector512<byte> PopCount(Vector512<byte> value);
public static Vector512<sbyte> PopCount(Vector512<sbyte> value);
// ShuffleBits takes a 64-bit-element `value` and an 8-bit-element `control`, and returns an 8-bit-element vector.
// The accompanying review comment says returning a vector is consistent with other APIs that "return masks".
public static Vector512<byte> ShuffleBits(Vector512<ulong> value, Vector512<byte> control);
public static Vector512<sbyte> ShuffleBits(Vector512<long> value, Vector512<sbyte> control);
// MaskShuffleBits adds a leading `mask` operand of the same type as the result.
// NOTE(review): presumably the masked form of the same operation — confirm.
public static Vector512<byte> MaskShuffleBits(Vector512<byte> mask, Vector512<ulong> value, Vector512<byte> control);
public static Vector512<sbyte> MaskShuffleBits(Vector512<sbyte> mask, Vector512<long> value, Vector512<sbyte> control);
// Nested VL class deriving from Avx512BW.VL; it carries the Vector256 and Vector128 overloads.
// NOTE(review): listed without `new` or [Intrinsic], unlike X64 — confirm whether that is intentional.
public abstract class VL : Avx512BW.VL
{
public static new bool IsSupported { get; }
// Vector256 PopCount overloads.
public static Vector256<short> PopCount(Vector256<short> value);
public static Vector256<ushort> PopCount(Vector256<ushort> value);
public static Vector256<byte> PopCount(Vector256<byte> value);
public static Vector256<sbyte> PopCount(Vector256<sbyte> value);
// Vector128 PopCount overloads.
public static Vector128<short> PopCount(Vector128<short> value);
public static Vector128<ushort> PopCount(Vector128<ushort> value);
public static Vector128<byte> PopCount(Vector128<byte> value);
public static Vector128<sbyte> PopCount(Vector128<sbyte> value);
// Vector256 ShuffleBits / MaskShuffleBits overloads.
public static Vector256<byte> ShuffleBits(Vector256<ulong> value, Vector256<byte> control);
public static Vector256<sbyte> ShuffleBits(Vector256<long> value, Vector256<sbyte> control);
public static Vector256<byte> MaskShuffleBits(Vector256<byte> mask, Vector256<ulong> value, Vector256<byte> control);
public static Vector256<sbyte> MaskShuffleBits(Vector256<sbyte> mask, Vector256<long> value, Vector256<sbyte> control);
// Vector128 ShuffleBits / MaskShuffleBits overloads.
public static Vector128<byte> ShuffleBits(Vector128<ulong> value, Vector128<byte> control);
public static Vector128<sbyte> ShuffleBits(Vector128<long> value, Vector128<sbyte> control);
public static Vector128<byte> MaskShuffleBits(Vector128<byte> mask, Vector128<ulong> value, Vector128<byte> control);
public static Vector128<sbyte> MaskShuffleBits(Vector128<sbyte> mask, Vector128<long> value, Vector128<sbyte> control);
}
} |
This would be wonderful for our internal algorithm library as well. We need to compute hamming distances on larger number sets, and using the popcount x64 intrinsic is a limiting factor in our implementation. |
Background and motivation
Both VPOPCNTDQ and BITALG are supported by Intel in the Ice Lake and newer architectures, and by AMD in Zen 4. VPOPCNTDQ allows for parallel popcnt in either Vector128, Vector256, or Vector512 for ulong and uint. BITALG expands parallel popcnt for ushort and byte, and it also adds the VPSHUFBITQMB instruction, which performs a bit gather select. VPOPCNTW is highly beneficial for my current project, allowing me to count filled blocks row-by-row for a bit-board of block games.
API Proposal
API Usage
Alternative Designs
Avx512Vpopcntdq could have a different name.
MaskBitShuffle could have a different name and/or parameter/return types (e.g. mask and return type could be ulong instead of Vector512<byte>).
Risks
None
The text was updated successfully, but these errors were encountered: