-
Notifications
You must be signed in to change notification settings - Fork 4.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[arm64] Unexpected performance degradation for Vector128<short> #73921
Comments
Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch. Issue Details: While working on #73768 I've re-implemented …
Repro: public class Repro
{
private byte[] _bytes;
private short[] _shorts;
/// <summary>
/// Allocates the two 512-element search buffers and plants a single matching
/// element (the value 1) at the midpoint of each; every other element stays 0.
/// </summary>
public Repro()
{
    const int elementCount = 512;

    byte[] bytes = new byte[elementCount];
    bytes[bytes.Length / 2] = 1;
    _bytes = bytes;

    short[] shorts = new short[elementCount];
    shorts[shorts.Length / 2] = 1;
    _shorts = shorts;
}
// --- Baselines: the span helpers that ship with the framework ---

[Benchmark]
public int Byte_FirstIndex_BuiltIn()
{
    ReadOnlySpan<byte> haystack = new ReadOnlySpan<byte>(_bytes);
    return haystack.IndexOf((byte)1);
}

[Benchmark]
public int Byte_Lastndex_BuiltIn()
{
    ReadOnlySpan<byte> haystack = new ReadOnlySpan<byte>(_bytes);
    return haystack.LastIndexOf((byte)1);
}

[Benchmark]
public int Short_FirstIndex_BuiltIn()
{
    ReadOnlySpan<short> haystack = new ReadOnlySpan<short>(_shorts);
    return haystack.IndexOf((short)1);
}

[Benchmark]
public int Short_Lastndex_BuiltIn()
{
    ReadOnlySpan<short> haystack = new ReadOnlySpan<short>(_shorts);
    return haystack.LastIndexOf((short)1);
}

// --- Candidates: the generic Vector128<T> helpers defined in this class ---

[Benchmark]
public int Byte_FirstIndex()
{
    byte[] haystack = _bytes;
    return IndexOfValueType<byte>(ref haystack[0], 1, haystack.Length);
}

[Benchmark]
public int Byte_Lastndex()
{
    byte[] haystack = _bytes;
    return LastIndexOfValueType<byte>(ref haystack[0], 1, haystack.Length);
}

[Benchmark]
public int Short_FirstIndex()
{
    short[] haystack = _shorts;
    return IndexOfValueType<short>(ref haystack[0], 1, haystack.Length);
}

[Benchmark]
public int Short_Lastndex()
{
    short[] haystack = _shorts;
    return LastIndexOfValueType<short>(ref haystack[0], 1, haystack.Length);
}
// this method is performant for bytes and shorts
/// <summary>
/// Scans forward through <paramref name="length"/> elements starting at
/// <paramref name="searchSpace"/>, one <see cref="Vector128{T}"/> at a time, looking for
/// <paramref name="value"/>.
/// </summary>
/// <param name="searchSpace">Reference to the first element to inspect.</param>
/// <param name="value">The value to look for.</param>
/// <param name="length">Number of elements readable from <paramref name="searchSpace"/>.</param>
/// <returns>
/// The index produced by <c>ComputeFirstIndex</c> for the first vector that contains a match,
/// or -1 when no vector matches. The non-vectorized path is elided in this repro and also
/// falls through to -1.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveOptimization)]
private static int IndexOfValueType<T>(ref T searchSpace, T value, int length) where T : struct, INumber<T>
{
    if (Vector128.IsHardwareAccelerated && length >= Vector128<T>.Count)
    {
        Vector128<T> target = Vector128.Create(value);
        ref T cursor = ref searchSpace;
        // Start of the final full vector that still fits inside the search space.
        ref T lastVectorStart = ref Unsafe.Add(ref searchSpace, length - Vector128<T>.Count);

        // Step forward a whole vector at a time until the cursor passes the last full-vector start.
        do
        {
            Vector128<T> matches = Vector128.Equals(target, Vector128.LoadUnsafe(ref cursor));
            if (matches != Vector128<T>.Zero)
            {
                return ComputeFirstIndex(ref searchSpace, ref cursor, matches);
            }

            cursor = ref Unsafe.Add(ref cursor, Vector128<T>.Count);
        }
        while (!Unsafe.IsAddressGreaterThan(ref cursor, ref lastVectorStart));

        // When length is not a multiple of the vector size, the trailing elements have not been
        // seen yet: re-check the last full vector, which overlaps the previously scanned one.
        if ((uint)length % Vector128<T>.Count != 0)
        {
            Vector128<T> tailMatches = Vector128.Equals(target, Vector128.LoadUnsafe(ref lastVectorStart));
            if (tailMatches != Vector128<T>.Zero)
            {
                return ComputeFirstIndex(ref searchSpace, ref lastVectorStart, tailMatches);
            }
        }
    }
    else
    {
        // removed for brevity
    }

    return -1;
}
// this method is performant for bytes, but NOT for shorts
/// <summary>
/// Scans backward through <paramref name="length"/> elements starting at
/// <paramref name="searchSpace"/>, one <see cref="Vector128{T}"/> at a time, looking for
/// <paramref name="value"/>.
/// </summary>
/// <param name="searchSpace">Reference to the first element of the search space.</param>
/// <param name="value">The value to look for.</param>
/// <param name="length">Number of elements readable from <paramref name="searchSpace"/>.</param>
/// <returns>
/// The index produced by <c>ComputeLastIndex</c> for the first vector (scanning from the end)
/// that contains a match, or -1 when no vector matches. The non-vectorized path is elided in
/// this repro and also falls through to -1.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveOptimization)]
private static int LastIndexOfValueType<T>(ref T searchSpace, T value, int length)
    where T : struct, INumber<T>
{
    if (!Vector128.IsHardwareAccelerated || length < Vector128<T>.Count)
    {
        // removed for brevity
    }
    else
    {
        Vector128<T> equals, values = Vector128.Create(value);

        // Track the position as an element offset rather than as a moving managed reference.
        // Stepping a `ref T` one vector below searchSpace would form a reference that points
        // in front of the underlying array, which is not a location a managed pointer may
        // refer to; a signed offset can go negative harmlessly.
        nint offset = length - Vector128<T>.Count;

        // Visit every vector that starts strictly after the first element, from the end backwards.
        while (offset > 0)
        {
            equals = Vector128.Equals(values, Vector128.LoadUnsafe(ref searchSpace, (nuint)offset));
            if (equals != Vector128<T>.Zero)
            {
                return ComputeLastIndex(ref searchSpace, ref Unsafe.Add(ref searchSpace, offset), equals);
            }

            offset -= Vector128<T>.Count;
        }

        // Finish with the first vector in the search space. It is either the next aligned step
        // (length is a multiple of the vector size) or overlaps the vector scanned just before it.
        equals = Vector128.Equals(values, Vector128.LoadUnsafe(ref searchSpace));
        if (equals != Vector128<T>.Zero)
        {
            return ComputeLastIndex(ref searchSpace, ref searchSpace, equals);
        }
    }

    return -1;
}
/// <summary>
/// Translates a per-lane comparison result into an element index relative to
/// <paramref name="searchSpace"/>, picking the lowest lane whose most significant bit is set.
/// </summary>
/// <param name="searchSpace">Reference to the first element of the search space.</param>
/// <param name="current">Reference to the element the compared vector was loaded from.</param>
/// <param name="equals">Comparison result for the vector loaded at <paramref name="current"/>.</param>
/// <returns>Element distance from <paramref name="searchSpace"/> to the lowest flagged lane.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int ComputeFirstIndex<T>(ref T searchSpace, ref T current, Vector128<T> equals) where T : struct
{
    // One bit per lane: bit i is the most significant bit of lane i.
    uint laneMask = equals.ExtractMostSignificantBits();
    int laneIndex = BitOperations.TrailingZeroCount(laneMask);

    // Convert the byte distance between the two references into a count of T elements.
    long byteDistance = (long)Unsafe.ByteOffset(ref searchSpace, ref current);
    int elementsBefore = (int)(byteDistance / Unsafe.SizeOf<T>());

    return laneIndex + elementsBefore;
}
/// <summary>
/// Translates a per-lane comparison result into an element index relative to
/// <paramref name="searchSpace"/>, picking the highest lane whose most significant bit is set.
/// </summary>
/// <param name="searchSpace">Reference to the first element of the search space.</param>
/// <param name="current">Reference to the element the compared vector was loaded from.</param>
/// <param name="equals">Comparison result for the vector loaded at <paramref name="current"/>.</param>
/// <returns>Element distance from <paramref name="searchSpace"/> to the highest flagged lane.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int ComputeLastIndex<T>(ref T searchSpace, ref T current, Vector128<T> equals) where T : struct
{
    // One bit per lane: bit i is the most significant bit of lane i.
    uint laneMask = equals.ExtractMostSignificantBits();

    // Position of the highest set bit in the 32-bit mask: 31 is the top bit's index,
    // and the leading-zero count says how far below it the first set bit sits.
    int laneIndex = 31 - BitOperations.LeadingZeroCount(laneMask);

    // Convert the byte distance between the two references into a count of T elements.
    long byteDistance = (long)Unsafe.ByteOffset(ref searchSpace, ref current);
    int elementsBefore = (int)(byteDistance / Unsafe.SizeOf<T>());

    return elementsBefore + laneIndex;
}
}
BenchmarkDotNet=v0.13.1.1845-nightly, OS=ubuntu 20.04
Unknown processor
.NET SDK=7.0.100-rc.1.22413.1
[Host] : .NET 7.0.0 (7.0.22.41112), Arm64 RyuJIT AdvSIMD
Job-UQDAMS : .NET 7.0.0 (7.0.22.41112), Arm64 RyuJIT AdvSIMD
cc @EgorBo @kunalspathak @AndyAyersMS
|
@adamsitnik I can't repro it on Main and Preview7; I get:
|
@EgorBo thanks for the sanity check! I've synced my fork again and performed a full build (previously I was building only a subset of CoreLib), and I also can't repro it now.
While working on #73768 I've re-implemented Span.IndexOf and Span.LastIndexOf by using Vector128<T> with a generic implementation that can be used for byte, short, int and long. IndexOfValueType and LastIndexOfValueType are very similar, but the first is performant for both byte and short, while the latter only for byte. It's very likely related to #73804.
Repro:
cc @EgorBo @kunalspathak @AndyAyersMS
The text was updated successfully, but these errors were encountered: