-
Notifications
You must be signed in to change notification settings - Fork 4.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add Avx512 support to ProbabilisticMap #96931
Conversation
Tagging subscribers to this area: @dotnet/area-system-memory
Issue Details
public class ProbMap
{
private static readonly SearchValues<char> s_values = SearchValues.Create("abcABC123ű");
private string _text = "ő";
[Params(16, 32, 64, 128, 256, 10_000)]
public int Length;
[GlobalSetup]
public void Setup() => _text += new string('\n', Length - 1);
[Benchmark]
public int IndexOfAny() => _text.AsSpan().IndexOfAny(s_values);
}
Besides just doubling the vector width, Avx512Vbmi.VL gives us access to better permute instructions, allowing us to replace
Vector256<byte> bitMaskLower = Avx2.Shuffle(charMapLower, index);
Vector256<byte> bitMaskUpper = Avx2.Shuffle(charMapUpper, index - Vector256.Create((byte)16));
Vector256<byte> mask = Vector256.GreaterThan(index, Vector256.Create((byte)15));
Vector256<byte> bitMask = Vector256.ConditionalSelect(mask, bitMaskUpper, bitMaskLower);
with
Vector256<byte> bitMask = Avx512Vbmi.VL.PermuteVar32x8(charMap, index);
AVX2 vs AVX512 main loop
// AVX2 loop to process 32 characters // AVX512 loop to process 64 characters
M02_L00: M02_L00:
vmovups ymm4,[r14] vmovups zmm1,[rbx]
vmovups ymm5,[r14+20] vmovups zmm2,[rbx+40]
vmovups ymm6,[7FC7B99432C0] vmovups zmm3,[7F3FF2744E40]
vpand ymm7,ymm4,ymm6 vpandd zmm4,zmm1,zmm3
vpand ymm6,ymm5,ymm6 vpandd zmm3,zmm2,zmm3
vpackuswb ymm6,ymm7,ymm6 vpackuswb zmm3,zmm4,zmm3
vpsrlw ymm4,ymm4,8 vpsrlw zmm1,zmm1,8
vpsrlw ymm5,ymm5,8 vpsrlw zmm2,zmm2,8
vpackuswb ymm4,ymm4,ymm5 vpackuswb zmm1,zmm1,zmm2
vpsrld ymm5,ymm6,5 vmovups zmm2,[7F3FF2744E80]
vmovups ymm7,[7FC7B99432E0] vpandd zmm4,zmm3,zmm2
vpand ymm5,ymm5,ymm7 vpermb zmm4,zmm4,zmm0
vmovups ymm8,[7FC7B9943300] vpsrld zmm3,zmm3,5
vpshufb ymm5,ymm8,ymm5 vmovups zmm5,[7F3FF2744EC0]
vmovups ymm8,[7FC7B9943320] vpandd zmm3,zmm3,zmm5
vpand ymm6,ymm6,ymm8 vmovups zmm6,[7F3FF2744F00]
vmovups ymm9,[7FC7B9943340] vpshufb zmm3,zmm6,zmm3
vpsubb ymm10,ymm6,ymm9 vpandd zmm3,zmm4,zmm3
vpcmpgtb ymm10,ymm10,[7FC7B9943360] vptestnmb k1,zmm3,zmm3
vmovups ymm11,[7FC7B9943380] vpandd zmm2,zmm1,zmm2
vpsubb ymm12,ymm6,ymm11 vpermb zmm2,zmm2,zmm0
vpshufb ymm12,ymm3,ymm12 vpsrld zmm1,zmm1,5
vpshufb ymm6,ymm2,ymm6 vpandd zmm1,zmm1,zmm5
vpblendvb ymm6,ymm6,ymm12,ymm10 vpshufb zmm1,zmm6,zmm1
vpand ymm5,ymm6,ymm5 vpandd zmm1,zmm2,zmm1
vxorps ymm6,ymm6,ymm6 vptestnmb k2,zmm1,zmm1
vpcmpeqb ymm5,ymm5,ymm6 korq k1,k1,k2
vpsrld ymm6,ymm4,5 vpmovm2b zmm1,k1
vpand ymm6,ymm6,ymm7 vpternlogd zmm2,zmm2,zmm2,0FF
vmovups ymm7,[7FC7B9943300] vpxord zmm1,zmm1,zmm2
vpshufb ymm6,ymm7,ymm6 vptestmb k1,zmm1,zmm1
vpand ymm4,ymm4,ymm8 kortestq k1,k1
vpsubb ymm7,ymm4,ymm9 jne short M02_L03
vpcmpgtb ymm7,ymm7,[7FC7B9943360]
vpsubb ymm8,ymm4,ymm11
vpshufb ymm8,ymm3,ymm8
vpshufb ymm4,ymm2,ymm4
vpblendvb ymm4,ymm4,ymm8,ymm7
vpand ymm4,ymm4,ymm6
vxorps ymm6,ymm6,ymm6
vpcmpeqb ymm4,ymm4,ymm6
vpor ymm4,ymm5,ymm4
vpcmpeqd ymm5,ymm5,ymm5
vpxor ymm4,ymm4,ymm5
vptest ymm4,ymm4
jne near ptr M02_L10
M02_L01: M02_L01:
add r14,40 add rbx,80
cmp r14,r13 cmp rbx,r13
vmovups ymm2,[rbp-70] vmovups zmm0,[rbp-70]
vmovups ymm3,[rbp-90] jbe near ptr M02_L00
jbe near ptr M02_L00
|
@MihaZupan is this just pending review or is it blocked on something? Noticed you have it self-assigned so want to make sure. If it is ready for review, could you click the |
0d21d17
to
68a5b3f
Compare
It's ready for review, I updated the branch now |
Besides just doubling the vector width,
Avx512Vbmi.VL
gives us access to better permute instructions, allowing us to replace the AVX2 shuffle-and-select sequence with a single PermuteVar32x8 call.
AVX2 vs AVX512 main loop