From ef9e53b1399b0e363db5bcdd704ef38057812365 Mon Sep 17 00:00:00 2001 From: Sven Boemer Date: Mon, 24 Apr 2023 20:12:47 +0000 Subject: [PATCH] Inline more Avx2 helpers --- .../IndexOfAnyAsciiSearcher.cs | 187 +++++++++--------- .../src/System/SpanHelpers.Packed.cs | 99 ++++++---- 2 files changed, 149 insertions(+), 137 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/IndexOfAnyAsciiSearcher.cs b/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/IndexOfAnyAsciiSearcher.cs index c39639d24f826..92b588d06f494 100644 --- a/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/IndexOfAnyAsciiSearcher.cs +++ b/src/libraries/System.Private.CoreLib/src/System/IndexOfAnyValues/IndexOfAnyAsciiSearcher.cs @@ -195,7 +195,12 @@ internal static int IndexOfAnyVectorized(ref short sea Vector256 result = IndexOfAnyLookup(source0, source1, bitmap256); if (result != Vector256.Zero) { - return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result); + result = Avx2.Permute4x64(result.AsInt64(), 0b_11_01_10_00).AsByte(); + + uint mask = TNegator.ExtractMask(result); + + int offsetInVector = BitOperations.TrailingZeroCount(mask); + return offsetInVector + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref currentSearchSpace) / (nuint)sizeof(short)); } currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, 2 * Vector256.Count); @@ -219,7 +224,18 @@ internal static int IndexOfAnyVectorized(ref short sea Vector256 result = IndexOfAnyLookup(source0, source1, bitmap256); if (result != Vector256.Zero) { - return ComputeFirstIndexOverlapped(ref searchSpace, ref firstVector, ref oneVectorAwayFromEnd, result); + result = Avx2.Permute4x64(result.AsInt64(), 0b_11_01_10_00).AsByte(); + + uint mask = TNegator.ExtractMask(result); + + int offsetInVector = BitOperations.TrailingZeroCount(mask); + if (offsetInVector >= Vector256.Count) + { + // We matched within the second vector + firstVector = ref oneVectorAwayFromEnd; + offsetInVector -= Vector256.Count; + } + return offsetInVector + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref firstVector) / (nuint)sizeof(short)); } } @@ -307,7 +323,12 @@ internal static int LastIndexOfAnyVectorized(ref short Vector256 result = IndexOfAnyLookup(source0, source1, bitmap256); if (result != Vector256.Zero) { - return ComputeLastIndex(ref searchSpace, ref currentSearchSpace, result); + result = Avx2.Permute4x64(result.AsInt64(), 0b_11_01_10_00).AsByte(); + + uint mask = TNegator.ExtractMask(result); + + int offsetInVector = 31 - BitOperations.LeadingZeroCount(mask); + return offsetInVector + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref currentSearchSpace) / (nuint)sizeof(short)); } } while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref twoVectorsAfterStart)); @@ -329,7 +350,18 @@ internal static int LastIndexOfAnyVectorized(ref short Vector256 result = IndexOfAnyLookup(source0, source1, bitmap256); if (result != Vector256.Zero) { - return ComputeLastIndexOverlapped(ref searchSpace, ref secondVector, result); + result = Avx2.Permute4x64(result.AsInt64(), 0b_11_01_10_00).AsByte(); + + uint mask = TNegator.ExtractMask(result); + + int offsetInVector = 31 - BitOperations.LeadingZeroCount(mask); + if (offsetInVector < Vector256.Count) + { + return offsetInVector; + } + + // We matched within the second vector + return offsetInVector - Vector256.Count + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref secondVector) / (nuint)sizeof(short)); } } @@ -411,7 +443,10 @@ internal static int IndexOfAnyVectorized(ref byte searchSpace, int sea Vector256 result = TNegator.NegateIfNeeded(IndexOfAnyLookupCore(source, bitmap256)); if (result != Vector256.Zero) { - return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result); + uint mask = TNegator.ExtractMask(result); + + int offsetInVector = BitOperations.TrailingZeroCount(mask); + return offsetInVector + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref currentSearchSpace) / (nuint)sizeof(byte)); } currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector256.Count); @@ -436,7 +471,16 @@ internal static int IndexOfAnyVectorized(ref byte searchSpace, int sea Vector256 result = TNegator.NegateIfNeeded(IndexOfAnyLookupCore(source, bitmap256)); if (result != Vector256.Zero) { - return ComputeFirstIndexOverlapped(ref searchSpace, ref firstVector, ref halfVectorAwayFromEnd, result); + uint mask = TNegator.ExtractMask(result); + + int offsetInVector = BitOperations.TrailingZeroCount(mask); + if (offsetInVector >= Vector256.Count) + { + // We matched within the second vector + firstVector = ref halfVectorAwayFromEnd; + offsetInVector -= Vector256.Count; + } + return offsetInVector + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref firstVector) / (nuint)sizeof(byte)); } } @@ -518,7 +562,10 @@ internal static int LastIndexOfAnyVectorized(ref byte searchSpace, int Vector256 result = TNegator.NegateIfNeeded(IndexOfAnyLookupCore(source, bitmap256)); if (result != Vector256.Zero) { - return ComputeLastIndex(ref searchSpace, ref currentSearchSpace, result); + uint mask = TNegator.ExtractMask(result); + + int offsetInVector = 31 - BitOperations.LeadingZeroCount(mask); + return offsetInVector + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref currentSearchSpace) / (nuint)sizeof(byte)); } } while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref vectorAfterStart)); @@ -541,7 +588,16 @@ internal static int LastIndexOfAnyVectorized(ref byte searchSpace, int Vector256 result = TNegator.NegateIfNeeded(IndexOfAnyLookupCore(source, bitmap256)); if (result != Vector256.Zero) { - return ComputeLastIndexOverlapped(ref searchSpace, ref secondVector, result); + uint mask = TNegator.ExtractMask(result); + + int offsetInVector = 31 - BitOperations.LeadingZeroCount(mask); + if (offsetInVector < Vector256.Count) + { + return offsetInVector; + } + + // We matched within the second vector + return offsetInVector - Vector256.Count + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref secondVector) / (nuint)sizeof(byte)); } } @@ -622,7 +678,10 @@ internal static int IndexOfAnyVectorized(ref byte searchSpace, int sea Vector256 result = IndexOfAnyLookup(source, bitmap256_0, bitmap256_1); if (result != Vector256.Zero) { - return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result); + uint mask = TNegator.ExtractMask(result); + + int offsetInVector = BitOperations.TrailingZeroCount(mask); + return offsetInVector + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref currentSearchSpace) / (nuint)sizeof(byte)); } currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, Vector256.Count); @@ -647,7 +706,16 @@ internal static int IndexOfAnyVectorized(ref byte searchSpace, int sea Vector256 result = IndexOfAnyLookup(source, bitmap256_0, bitmap256_1); if (result != Vector256.Zero) { - return ComputeFirstIndexOverlapped(ref searchSpace, ref firstVector, ref halfVectorAwayFromEnd, result); + uint mask = TNegator.ExtractMask(result); + + int offsetInVector = BitOperations.TrailingZeroCount(mask); + if (offsetInVector >= Vector256.Count) + { + // We matched within the second vector + firstVector = ref halfVectorAwayFromEnd; + offsetInVector -= Vector256.Count; + } + return offsetInVector + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref firstVector) / (nuint)sizeof(byte)); } } @@ -730,7 +798,10 @@ internal static int LastIndexOfAnyVectorized(ref byte searchSpace, int Vector256 result = IndexOfAnyLookup(source, bitmap256_0, bitmap256_1); if (result != Vector256.Zero) { - return ComputeLastIndex(ref searchSpace, ref currentSearchSpace, result); + uint mask = TNegator.ExtractMask(result); + + int offsetInVector = 31 - BitOperations.LeadingZeroCount(mask); + return offsetInVector + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref currentSearchSpace) / (nuint)sizeof(byte)); } } while (Unsafe.IsAddressGreaterThan(ref currentSearchSpace, ref vectorAfterStart)); @@ -753,7 +824,16 @@ internal static int LastIndexOfAnyVectorized(ref byte searchSpace, int Vector256 result = IndexOfAnyLookup(source, bitmap256_0, bitmap256_1); if (result != Vector256.Zero) { - return ComputeLastIndexOverlapped(ref searchSpace, ref secondVector, result); + uint mask = TNegator.ExtractMask(result); + + int offsetInVector = 31 - BitOperations.LeadingZeroCount(mask); + if (offsetInVector < Vector256.Count) + { + return offsetInVector; + } + + // We matched within the second vector + return offsetInVector - Vector256.Count + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref secondVector) / (nuint)sizeof(byte)); } } @@ -992,89 +1072,6 @@ private static unsafe int ComputeLastIndexOverlapped(ref T searchSp return offsetInVector - Vector128.Count + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref secondVector) / (nuint)sizeof(T)); } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static unsafe int ComputeFirstIndex(ref T searchSpace, ref T current, Vector256 result) - where TNegator : struct, INegator - { - if (typeof(T) == typeof(short)) - { - result = FixUpPackedVector256Result(result); - } - - uint mask = TNegator.ExtractMask(result); - - int offsetInVector = BitOperations.TrailingZeroCount(mask); - return offsetInVector + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref current) / (nuint)sizeof(T)); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static unsafe int ComputeFirstIndexOverlapped(ref T searchSpace, ref T current0, ref T current1, Vector256 result) - where TNegator : struct, INegator - { - if (typeof(T) == typeof(short)) - { - result = FixUpPackedVector256Result(result); - } - - uint mask = TNegator.ExtractMask(result); - - int offsetInVector = BitOperations.TrailingZeroCount(mask); - if (offsetInVector >= Vector256.Count) - { - // We matched within the second vector - current0 = ref current1; - offsetInVector -= Vector256.Count; - } - return offsetInVector + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref current0) / (nuint)sizeof(T)); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static unsafe int ComputeLastIndex(ref T searchSpace, ref T current, Vector256 result) - where TNegator : struct, INegator - { - if (typeof(T) == typeof(short)) - { - result = FixUpPackedVector256Result(result); - } - - uint mask = TNegator.ExtractMask(result); - - int offsetInVector = 31 - BitOperations.LeadingZeroCount(mask); - return offsetInVector + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref current) / (nuint)sizeof(T)); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static unsafe int ComputeLastIndexOverlapped(ref T searchSpace, ref T secondVector, Vector256 result) - where TNegator : struct, INegator - { - if (typeof(T) == typeof(short)) - { - result = FixUpPackedVector256Result(result); - } - - uint mask = TNegator.ExtractMask(result); - - int offsetInVector = 31 - BitOperations.LeadingZeroCount(mask); - if (offsetInVector < Vector256.Count) - { - return offsetInVector; - } - - // We matched within the second vector - return offsetInVector - Vector256.Count + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref secondVector) / (nuint)sizeof(T)); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static Vector256 FixUpPackedVector256Result(Vector256 result) - { - Debug.Assert(Avx2.IsSupported); - // Avx2.PackUnsignedSaturate(Vector256.Create((short)1), Vector256.Create((short)2)) will result in - // 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2 - // We want to swap the X and Y bits - // 1, 1, 1, 1, 1, 1, 1, 1, X, X, X, X, X, X, X, X, Y, Y, Y, Y, Y, Y, Y, Y, 2, 2, 2, 2, 2, 2, 2, 2 - return Avx2.Permute4x64(result.AsInt64(), 0b_11_01_10_00).AsByte(); - } - internal interface INegator { static abstract bool NegateIfNeeded(bool result); diff --git a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Packed.cs b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Packed.cs index 630170df36d2a..fa1fec132b12e 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Packed.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Packed.cs @@ -1,4 +1,4 @@ -// Licensed to the .NET Foundation under one or more agreements. +// Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. using System.Buffers.Binary; @@ -263,7 +263,10 @@ private static int IndexOf(ref short searchSpace, short value, int len if (result != Vector256.Zero) { - return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result); + uint notEqualsElements = Avx2.Permute4x64(result.AsInt64(), 0b_11_01_10_00).AsByte().ExtractMostSignificantBits(); + + int index = BitOperations.TrailingZeroCount(notEqualsElements); + return index + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref currentSearchSpace) / sizeof(short)); } currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, 2 * Vector256.Count); @@ -288,7 +291,16 @@ private static int IndexOf(ref short searchSpace, short value, int len if (result != Vector256.Zero) { - return ComputeFirstIndexOverlapped(ref searchSpace, ref firstVector, ref oneVectorAwayFromEnd, result); + uint notEqualsElements = Avx2.Permute4x64(result.AsInt64(), 0b_11_01_10_00).AsByte().ExtractMostSignificantBits(); + + int offsetInVector = BitOperations.TrailingZeroCount(notEqualsElements); + if (offsetInVector >= Vector256.Count) + { + // We matched within the second vector + firstVector = ref oneVectorAwayFromEnd; + offsetInVector -= Vector256.Count; + } + return offsetInVector + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref firstVector) / sizeof(short)); } } } @@ -412,7 +424,10 @@ private static int IndexOfAny(ref short searchSpace, short value0, sho if (result != Vector256.Zero) { - return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result); + uint notEqualsElements = Avx2.Permute4x64(result.AsInt64(), 0b_11_01_10_00).AsByte().ExtractMostSignificantBits(); + + int index = BitOperations.TrailingZeroCount(notEqualsElements); + return index + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref currentSearchSpace) / sizeof(short)); } currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, 2 * Vector256.Count); @@ -437,7 +452,16 @@ private static int IndexOfAny(ref short searchSpace, short value0, sho if (result != Vector256.Zero) { - return ComputeFirstIndexOverlapped(ref searchSpace, ref firstVector, ref oneVectorAwayFromEnd, result); + uint notEqualsElements = Avx2.Permute4x64(result.AsInt64(), 0b_11_01_10_00).AsByte().ExtractMostSignificantBits(); + + int offsetInVector = BitOperations.TrailingZeroCount(notEqualsElements); + if (offsetInVector >= Vector256.Count) + { + // We matched within the second vector + firstVector = ref oneVectorAwayFromEnd; + offsetInVector -= Vector256.Count; + } + return offsetInVector + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref firstVector) / sizeof(short)); } } } @@ -564,7 +588,10 @@ private static int IndexOfAny(ref short searchSpace, short value0, sho if (result != Vector256.Zero) { - return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result); + uint notEqualsElements = Avx2.Permute4x64(result.AsInt64(), 0b_11_01_10_00).AsByte().ExtractMostSignificantBits(); + + int index = BitOperations.TrailingZeroCount(notEqualsElements); + return index + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref currentSearchSpace) / sizeof(short)); } currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, 2 * Vector256.Count); @@ -589,7 +616,16 @@ private static int IndexOfAny(ref short searchSpace, short value0, sho if (result != Vector256.Zero) { - return ComputeFirstIndexOverlapped(ref searchSpace, ref firstVector, ref oneVectorAwayFromEnd, result); + uint notEqualsElements = Avx2.Permute4x64(result.AsInt64(), 0b_11_01_10_00).AsByte().ExtractMostSignificantBits(); + + int offsetInVector = BitOperations.TrailingZeroCount(notEqualsElements); + if (offsetInVector >= Vector256.Count) + { + // We matched within the second vector + firstVector = ref oneVectorAwayFromEnd; + offsetInVector -= Vector256.Count; + } + return offsetInVector + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref firstVector) / sizeof(short)); } } } @@ -698,7 +734,10 @@ private static int IndexOfAnyInRange(ref short searchSpace, short lowI if (result != Vector256.Zero) { - return ComputeFirstIndex(ref searchSpace, ref currentSearchSpace, result); + uint notEqualsElements = Avx2.Permute4x64(result.AsInt64(), 0b_11_01_10_00).AsByte().ExtractMostSignificantBits(); + + int index = BitOperations.TrailingZeroCount(notEqualsElements); + return index + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref currentSearchSpace) / sizeof(short)); } currentSearchSpace = ref Unsafe.Add(ref currentSearchSpace, 2 * Vector256.Count); @@ -723,7 +762,16 @@ private static int IndexOfAnyInRange(ref short searchSpace, short lowI if (result != Vector256.Zero) { - return ComputeFirstIndexOverlapped(ref searchSpace, ref firstVector, ref oneVectorAwayFromEnd, result); + uint notEqualsElements = Avx2.Permute4x64(result.AsInt64(), 0b_11_01_10_00).AsByte().ExtractMostSignificantBits(); + + int offsetInVector = BitOperations.TrailingZeroCount(notEqualsElements); + if (offsetInVector >= Vector256.Count) + { + // We matched within the second vector + firstVector = ref oneVectorAwayFromEnd; + offsetInVector -= Vector256.Count; + } + return offsetInVector + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref firstVector) / sizeof(short)); } } } @@ -819,14 +867,6 @@ private static int ComputeFirstIndex(ref short searchSpace, ref short current, V return index + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref current) / sizeof(short)); } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static int ComputeFirstIndex(ref short searchSpace, ref short current, Vector256 equals) - { - uint notEqualsElements = FixUpPackedVector256Result(equals).ExtractMostSignificantBits(); - int index = BitOperations.TrailingZeroCount(notEqualsElements); - return index + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref current) / sizeof(short)); - } - [MethodImpl(MethodImplOptions.AggressiveInlining)] private static int ComputeFirstIndexOverlapped(ref short searchSpace, ref short current0, ref short current1, Vector128 equals) { @@ -840,30 +880,5 @@ private static int ComputeFirstIndexOverlapped(ref short searchSpace, ref short } return offsetInVector + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref current0) / sizeof(short)); } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static int ComputeFirstIndexOverlapped(ref short searchSpace, ref short current0, ref short current1, Vector256 equals) - { - uint notEqualsElements = FixUpPackedVector256Result(equals).ExtractMostSignificantBits(); - int offsetInVector = BitOperations.TrailingZeroCount(notEqualsElements); - if (offsetInVector >= Vector256.Count) - { - // We matched within the second vector - current0 = ref current1; - offsetInVector -= Vector256.Count; - } - return offsetInVector + (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref current0) / sizeof(short)); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static Vector256 FixUpPackedVector256Result(Vector256 result) - { - Debug.Assert(Avx2.IsSupported); - // Avx2.PackUnsignedSaturate(Vector256.Create((short)1), Vector256.Create((short)2)) will result in - // 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2 - // We want to swap the X and Y bits - // 1, 1, 1, 1, 1, 1, 1, 1, X, X, X, X, X, X, X, X, Y, Y, Y, Y, Y, Y, Y, Y, 2, 2, 2, 2, 2, 2, 2, 2 - return Avx2.Permute4x64(result.AsInt64(), 0b_11_01_10_00).AsByte(); - } } }