Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

port SpanHelpers.IndexOfAny(ref byte, byte, byte, int) to Vector128/256 #73556

Closed
wants to merge 4 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
333 changes: 50 additions & 283 deletions src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Byte.cs
Original file line number Diff line number Diff line change
Expand Up @@ -774,319 +774,86 @@ public static int LastIndexOf(ref byte searchSpace, byte value, int length)
// NOTE(review): this listing comes from a diff view ("50 additions & 283 deletions") that carries no +/- markers,
// so removed and added lines appear interleaved (for example 'offset' is declared twice below).
// Read it as the old goto-based implementation and the new Vector128/Vector256 implementation shown together;
// it is presumably not compilable exactly as listed.
/// <summary>
/// Searches the first <paramref name="length"/> bytes starting at <paramref name="searchSpace"/> for
/// either <paramref name="value0"/> or <paramref name="value1"/>.
/// </summary>
/// <param name="searchSpace">Reference to the first byte to examine.</param>
/// <param name="value0">First byte value to look for.</param>
/// <param name="value1">Second byte value to look for.</param>
/// <param name="length">Number of bytes to examine; asserted to be non-negative.</param>
/// <returns>The zero-based offset of the first byte equal to either value, or -1 if none is found.</returns>
[MethodImpl(MethodImplOptions.AggressiveOptimization)]
public static int IndexOfAny(ref byte searchSpace, byte value0, byte value1, int length)
{
Debug.Assert(length >= 0);

uint uValue0 = value0; // Use uint for comparisons to avoid unnecessary 8->32 extensions
uint uValue1 = value1; // Use uint for comparisons to avoid unnecessary 8->32 extensions
nuint offset = 0; // Use nuint for arithmetic to avoid unnecessary 64->32->64 truncations
nuint lengthToExamine = (nuint)(uint)length;

if (Sse2.IsSupported || AdvSimd.Arm64.IsSupported)
{
// Avx2 branch also operates on Sse2 sizes, so check is combined.
nint vectorDiff = (nint)length - Vector128<byte>.Count;
if (vectorDiff >= 0)
{
// >= Sse2 intrinsics are supported, and length is enough to use them so use that path.
// We jump forward to the intrinsics at the end of the method so a naive branch predict
// will choose the non-intrinsic path so short lengths which don't gain anything aren't
// overly disadvantaged by having to jump over a lot of code. Whereas the longer lengths
// more than make this back from the intrinsics.
lengthToExamine = (nuint)vectorDiff;
goto IntrinsicsCompare;
}
}
else if (Vector.IsHardwareAccelerated)
{
// Calculate lengthToExamine here for test, as it is used later
nint vectorDiff = (nint)length - Vector<byte>.Count;
if (vectorDiff >= 0)
{
// Similar as above for Vector version
lengthToExamine = (nuint)vectorDiff;
goto IntrinsicsCompare;
}
}
nuint offset = 0;

uint lookUp;
while (lengthToExamine >= 8)
if (Vector256.IsHardwareAccelerated && length >= Vector256<byte>.Count)
{
lengthToExamine -= 8;

lookUp = Unsafe.AddByteOffset(ref searchSpace, offset);
if (uValue0 == lookUp || uValue1 == lookUp)
goto Found;
lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 1);
if (uValue0 == lookUp || uValue1 == lookUp)
goto Found1;
lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 2);
if (uValue0 == lookUp || uValue1 == lookUp)
goto Found2;
lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 3);
if (uValue0 == lookUp || uValue1 == lookUp)
goto Found3;
lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 4);
if (uValue0 == lookUp || uValue1 == lookUp)
goto Found4;
lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 5);
if (uValue0 == lookUp || uValue1 == lookUp)
goto Found5;
lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 6);
if (uValue0 == lookUp || uValue1 == lookUp)
goto Found6;
lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 7);
if (uValue0 == lookUp || uValue1 == lookUp)
goto Found7;

offset += 8;
}

if (lengthToExamine >= 4)
{
lengthToExamine -= 4;

lookUp = Unsafe.AddByteOffset(ref searchSpace, offset);
if (uValue0 == lookUp || uValue1 == lookUp)
goto Found;
lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 1);
if (uValue0 == lookUp || uValue1 == lookUp)
goto Found1;
lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 2);
if (uValue0 == lookUp || uValue1 == lookUp)
goto Found2;
lookUp = Unsafe.AddByteOffset(ref searchSpace, offset + 3);
if (uValue0 == lookUp || uValue1 == lookUp)
goto Found3;

offset += 4;
}

while (lengthToExamine > 0)
{

lookUp = Unsafe.AddByteOffset(ref searchSpace, offset);
if (uValue0 == lookUp || uValue1 == lookUp)
goto Found;

offset += 1;
lengthToExamine -= 1;
}

NotFound:
return -1;
Found: // Workaround for https://github.com/dotnet/runtime/issues/8795
return (int)offset;
Found1:
return (int)(offset + 1);
Found2:
return (int)(offset + 2);
Found3:
return (int)(offset + 3);
Found4:
return (int)(offset + 4);
Found5:
return (int)(offset + 5);
Found6:
return (int)(offset + 6);
Found7:
return (int)(offset + 7);

IntrinsicsCompare:
// When we move into a Vectorized block, we process everything of Vector size;
// and then for any remainder we do a final compare of Vector size but starting at
// the end and forwards, which may overlap on an earlier compare.

// We include the Supported check again here even though path will not be taken, so the asm isn't generated if not supported.
if (Sse2.IsSupported)
{
int matches;
if (Avx2.IsSupported)
{
Vector256<byte> search;
// Guard as we may only have a valid size for Vector128, in which case we fall through to the Sse2 path below.
// We have already subtracted Vector128<byte>.Count from lengthToExamine so compare against that
// to see if we have double the size for Vector256<byte>.Count
if (lengthToExamine >= (nuint)Vector128<byte>.Count)
{
Vector256<byte> values0 = Vector256.Create(value0);
Vector256<byte> values1 = Vector256.Create(value1);

// Subtract Vector128<byte>.Count so we have now subtracted Vector256<byte>.Count
lengthToExamine -= (nuint)Vector128<byte>.Count;
// First time this checks again against 0, however we will move into final compare if it fails.
while (lengthToExamine > offset)
{
search = LoadVector256(ref searchSpace, offset);
// Bitwise Or to combine the flagged matches for the second value to our match flags
matches = Avx2.MoveMask(
Avx2.Or(
Avx2.CompareEqual(values0, search),
Avx2.CompareEqual(values1, search)));
// Note that MoveMask has converted the equal vector elements into a set of bit flags,
// So the bit position in 'matches' corresponds to the element offset.
if (matches == 0)
{
// None matched
offset += (nuint)Vector256<byte>.Count;
continue;
}

goto IntrinsicsMatch;
}

// Move to Vector length from end for final compare
search = LoadVector256(ref searchSpace, lengthToExamine);
offset = lengthToExamine;
// Same method as above
matches = Avx2.MoveMask(
Avx2.Or(
Avx2.CompareEqual(values0, search),
Avx2.CompareEqual(values1, search)));
if (matches == 0)
{
// None matched
goto NotFound;
}
// NOTE(review): GetByteVector256SpanLength is not visible here; presumably it yields the offset bound
// up to which whole Vector256 loads stay inside 'length' - confirm against its definition.
nuint lastVectorOffset = GetByteVector256SpanLength(offset, length);
Vector256<byte> values0 = Vector256.Create(value0), values1 = Vector256.Create(value1), search, match;

goto IntrinsicsMatch;
}
}

// Initial size check was done on method entry.
Debug.Assert(length >= Vector128<byte>.Count);
do
{
Vector128<byte> search;
Vector128<byte> values0 = Vector128.Create(value0);
Vector128<byte> values1 = Vector128.Create(value1);
// First time this checks against 0 and we will move into final compare if it fails.
while (lengthToExamine > offset)
search = Vector256.LoadUnsafe(ref searchSpace, offset);
// A lane of 'match' is non-zero where the loaded byte equals value0 or value1.
match = Vector256.Equals(values0, search) | Vector256.Equals(values1, search);
if (match == Vector256<byte>.Zero)
{
search = LoadVector128(ref searchSpace, offset);

matches = Sse2.MoveMask(
Sse2.Or(
Sse2.CompareEqual(values0, search),
Sse2.CompareEqual(values1, search))
.AsByte());
// Note that MoveMask has converted the equal vector elements into a set of bit flags,
// So the bit position in 'matches' corresponds to the element offset.
if (matches == 0)
{
// None matched
offset += (nuint)Vector128<byte>.Count;
continue;
}

goto IntrinsicsMatch;
}
// Move to Vector length from end for final compare
search = LoadVector128(ref searchSpace, lengthToExamine);
offset = lengthToExamine;
// Same method as above
matches = Sse2.MoveMask(
Sse2.Or(
Sse2.CompareEqual(values0, search),
Sse2.CompareEqual(values1, search)));
if (matches == 0)
{
// None matched
goto NotFound;
offset += (nuint)Vector256<byte>.Count;
continue;
}
}

IntrinsicsMatch:
// Find bitflag offset of first match and add to current offset
offset += (nuint)BitOperations.TrailingZeroCount(matches);
goto Found;
return GetIndexFromMask(offset, match.ExtractMostSignificantBits());
} while (offset < lastVectorOffset);
}
else if (AdvSimd.Arm64.IsSupported)
// This Vector128 code path is also taken on Vector256-capable hardware when Vector128<byte>.Count <= length < Vector256<byte>.Count
else if (Vector128.IsHardwareAccelerated && length >= Vector128<byte>.Count)
{
Vector128<byte> search;
Vector128<byte> matches;
Vector128<byte> values0 = Vector128.Create(value0);
Vector128<byte> values1 = Vector128.Create(value1);
// First time this checks against 0 and we will move into final compare if it fails.
while (lengthToExamine > offset)
{
search = LoadVector128(ref searchSpace, offset);

matches = AdvSimd.Or(
AdvSimd.CompareEqual(values0, search),
AdvSimd.CompareEqual(values1, search));
nuint lastVectorOffset = GetByteVector128SpanLength(offset, length);
Vector128<byte> values0 = Vector128.Create(value0), values1 = Vector128.Create(value1), search, match;

if (matches == Vector128<byte>.Zero)
do
{
search = Vector128.LoadUnsafe(ref searchSpace, offset);
match = Vector128.Equals(values0, search) | Vector128.Equals(values1, search);
if (match == Vector128<byte>.Zero)
{
offset += (nuint)Vector128<byte>.Count;
continue;
}

// Find bitflag offset of first match and add to current offset
offset += FindFirstMatchedLane(matches);

goto Found;
}

// Move to Vector length from end for final compare
search = LoadVector128(ref searchSpace, lengthToExamine);
offset = lengthToExamine;
// Same method as above
matches = AdvSimd.Or(
AdvSimd.CompareEqual(values0, search),
AdvSimd.CompareEqual(values1, search));

if (matches == Vector128<byte>.Zero)
{
// None matched
goto NotFound;
}

// Find bitflag offset of first match and add to current offset
offset += FindFirstMatchedLane(matches);

goto Found;
return GetIndexFromMask(offset, match.ExtractMostSignificantBits());
} while (offset < lastVectorOffset);
}
else if (Vector.IsHardwareAccelerated)
else if (Vector.IsHardwareAccelerated && length >= Vector<byte>.Count)
{
Vector<byte> values0 = new Vector<byte>(value0);
Vector<byte> values1 = new Vector<byte>(value1);
nuint lastVectorOffset = GetByteVectorSpanLength(offset, length);
Vector<byte> values0 = new Vector<byte>(value0), values1 = new Vector<byte>(value1), search, match;

Vector<byte> search;
// First time this checks against 0 and we will move into final compare if it fails.
while (lengthToExamine > offset)
do
{
search = LoadVector(ref searchSpace, offset);
search = Vector.BitwiseOr(
Vector.Equals(search, values0),
Vector.Equals(search, values1));
if (Vector<byte>.Zero.Equals(search))
match = Vector.BitwiseOr(Vector.Equals(search, values0), Vector.Equals(search, values1));
if (Vector<byte>.Zero.Equals(match))
{
// None matched
offset += (nuint)Vector<byte>.Count;
continue;
}

goto VectorMatch;
}
// NOTE(review): where the comparison result is kept in 'match', 'search' still holds the raw loaded bytes,
// yet 'search' is what gets passed to LocateFirstFoundByte here. The variant that overwrote 'search' with
// the comparison result passed that result instead - this looks like it should be 'match'; confirm.
return (int)offset + LocateFirstFoundByte(search);
} while (offset < lastVectorOffset);
}

// Move to Vector length from end for final compare
search = LoadVector(ref searchSpace, lengthToExamine);
offset = lengthToExamine;
search = Vector.BitwiseOr(
Vector.Equals(search, values0),
Vector.Equals(search, values1));
if (Vector<byte>.Zero.Equals(search))
// Use uint for comparisons to avoid unnecessary 8->32 extensions
uint uValue0 = value0, uValue1 = value1, lookUp;
// Use nuint for offset to avoid unnecessary 64->32->64 truncations
// Scalar loop: checks one byte at a time from the current 'offset' up to 'length'.
for (; offset < (uint)length; offset++)
{
lookUp = Unsafe.AddByteOffset(ref searchSpace, offset);
if (uValue0 == lookUp || uValue1 == lookUp)
{
// None matched
goto NotFound;
return (int)offset;
}

VectorMatch:
offset += (nuint)LocateFirstFoundByte(search);
goto Found;
}

Debug.Fail("Unreachable");
goto NotFound;
return -1;

// Converts a per-element match bitmask into an index: 'offset' plus the position of the lowest set bit.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
static int GetIndexFromMask(nuint offset, uint matches)
{
// Note that ExtractMostSignificantBits has converted the equal vector elements into a set of bit flags,
// so the bit position in 'matches' corresponds to the element offset.
// Find bitflag offset of first match and add to current offset
return (int)offset + BitOperations.TrailingZeroCount(matches);
}
}

[MethodImpl(MethodImplOptions.AggressiveOptimization)]
Expand Down