Skip to content

Commit 641e08f

Browse files
benaadams authored and tannergooding committed
Intrinsicify SpanHelpers.IndexOf(char) (dotnet/coreclr#22505)
* Helpers to support Intrinsics in SpanHelpers.Char
* Intrinsicify SpanHelpers.IndexOf(char)
* Feedback
* fix
* Fix assert
* Improve comment warning
* fix
* fix
* Fix

Commit migrated from dotnet/coreclr@5676801
1 parent a0c6242 commit 641e08f

File tree

2 files changed

+279
-160
lines changed

2 files changed

+279
-160
lines changed

src/libraries/System.Private.CoreLib/src/System/SpanHelpers.Char.cs

+273-63
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,17 @@
55
using System.Diagnostics;
66
using System.Numerics;
77
using System.Runtime.CompilerServices;
8+
using System.Runtime.Intrinsics;
89
using System.Runtime.Intrinsics.X86;
910

1011
using Internal.Runtime.CompilerServices;
1112

1213
#if BIT64
1314
using nuint = System.UInt64;
15+
using nint = System.Int64;
1416
#else
1517
using nuint = System.UInt32;
18+
using nint = System.Int32;
1619
#endif
1720

1821
namespace System
@@ -218,93 +221,243 @@ public static unsafe int IndexOf(ref char searchSpace, char value, int length)
218221
{
219222
Debug.Assert(length >= 0);
220223

221-
fixed (char* pChars = &searchSpace)
222-
{
223-
char* pCh = pChars;
224-
char* pEndCh = pCh + length;
224+
nint offset = 0;
225+
nint lengthToExamine = length;
225226

226-
if (Vector.IsHardwareAccelerated && length >= Vector<ushort>.Count * 2)
227+
if (((int)Unsafe.AsPointer(ref searchSpace) & 1) != 0)
228+
{
229+
// Input isn't char aligned, we won't be able to align it to a Vector
230+
}
231+
else if (Sse2.IsSupported)
232+
{
233+
// Avx2 branch also operates on Sse2 sizes, so check is combined.
234+
// Needs to be double length to allow us to align the data first.
235+
if (length >= Vector128<ushort>.Count * 2)
227236
{
228-
// Figure out how many characters to read sequentially until we are vector aligned
229-
// This is equivalent to:
230-
// unaligned = ((int)pCh % Unsafe.SizeOf<Vector<ushort>>()) / elementsPerByte
231-
// length = (Vector<ushort>.Count - unaligned) % Vector<ushort>.Count
232-
const int elementsPerByte = sizeof(ushort) / sizeof(byte);
233-
int unaligned = ((int)pCh & (Unsafe.SizeOf<Vector<ushort>>() - 1)) / elementsPerByte;
234-
length = (Vector<ushort>.Count - unaligned) & (Vector<ushort>.Count - 1);
237+
lengthToExamine = UnalignedCountVector128(ref searchSpace);
235238
}
236-
237-
SequentialScan:
238-
while (length >= 4)
239+
}
240+
else if (Vector.IsHardwareAccelerated)
241+
{
242+
// Needs to be double length to allow us to align the data first.
243+
if (length >= Vector<ushort>.Count * 2)
239244
{
240-
length -= 4;
245+
lengthToExamine = UnalignedCountVector(ref searchSpace);
246+
}
247+
}
241248

242-
if (pCh[0] == value)
243-
goto Found;
244-
if (pCh[1] == value)
245-
goto Found1;
246-
if (pCh[2] == value)
247-
goto Found2;
248-
if (pCh[3] == value)
249-
goto Found3;
249+
SequentialScan:
250+
// In the non-vector case lengthToExamine is the total length.
251+
// In the vector case lengthToExamine first aligns to Vector,
252+
// then, in a second pass after the vectorized loop, it is the
253+
// remaining data that is shorter than a Vector length.
254+
while (lengthToExamine >= 4)
255+
{
256+
ref char current = ref Add(ref searchSpace, offset);
257+
258+
if (value == current)
259+
goto Found;
260+
if (value == Add(ref current, 1))
261+
goto Found1;
262+
if (value == Add(ref current, 2))
263+
goto Found2;
264+
if (value == Add(ref current, 3))
265+
goto Found3;
266+
267+
offset += 4;
268+
lengthToExamine -= 4;
269+
}
250270

251-
pCh += 4;
252-
}
271+
while (lengthToExamine > 0)
272+
{
273+
if (value == Add(ref searchSpace, offset))
274+
goto Found;
253275

254-
while (length > 0)
276+
offset += 1;
277+
lengthToExamine -= 1;
278+
}
279+
280+
// We get past SequentialScan only if IsHardwareAccelerated or intrinsic .IsSupported is true. However, we still have the redundant check to allow
281+
// the JIT to see that the code is unreachable and eliminate it when the platform does not have hardware acceleration.
282+
if (Avx2.IsSupported)
283+
{
284+
if (offset < length)
255285
{
256-
length--;
286+
Debug.Assert(length - offset >= Vector128<ushort>.Count);
287+
if (((nint)Unsafe.AsPointer(ref Unsafe.Add(ref searchSpace, (IntPtr)offset)) & (nint)(Vector256<byte>.Count - 1)) != 0)
288+
{
289+
// Not currently aligned to Vector256 (is aligned to Vector128); this can cause a problem for searches
290+
// with no upper bound e.g. String.wcslen. Start with a check on Vector128 to align to Vector256,
291+
// before moving to processing Vector256.
292+
293+
// If the input searchSpan has been fixed or pinned, this ensures we do not fault across memory pages
294+
// while searching for an end of string. Specifically, this assumes that the length is either correct
295+
// or that the data is pinned otherwise it may cause an AccessViolation from crossing a page boundary into an
296+
// unowned page. If the search is unbounded (e.g. null terminator in wcslen) and the search value is not found,
297+
// again this will likely cause an AccessViolation. However, correctly bounded searches will return -1 rather
298+
// than ever causing an AV.
299+
300+
// If the searchSpan has not been fixed or pinned the GC can relocate it during the execution of this
301+
// method, so the alignment only acts as best endeavour. The GC cost is likely to dominate over
302+
// the misalignment that may occur after; so we default to giving the GC a free hand to relocate and
303+
// it's up to the caller whether they are operating over fixed data.
304+
Vector128<ushort> values = Vector128.Create((ushort)value);
305+
Vector128<ushort> search = LoadVector128(ref searchSpace, offset);
306+
307+
// Same method as below
308+
int matches = Sse2.MoveMask(Sse2.CompareEqual(values, search).AsByte());
309+
if (matches == 0)
310+
{
311+
// Zero flags set so no matches
312+
offset += Vector128<ushort>.Count;
313+
}
314+
else
315+
{
316+
// Find bitflag offset of first match and add to current offset
317+
return (int)(offset + (BitOperations.TrailingZeroCount(matches) / sizeof(char)));
318+
}
319+
}
257320

258-
if (pCh[0] == value)
259-
goto Found;
321+
lengthToExamine = GetCharVector256SpanLength(offset, length);
322+
if (lengthToExamine > 0)
323+
{
324+
Vector256<ushort> values = Vector256.Create((ushort)value);
325+
do
326+
{
327+
Debug.Assert(lengthToExamine >= Vector256<ushort>.Count);
328+
329+
Vector256<ushort> search = LoadVector256(ref searchSpace, offset);
330+
int matches = Avx2.MoveMask(Avx2.CompareEqual(values, search).AsByte());
331+
// Note that MoveMask has converted the equal vector elements into a set of bit flags,
332+
// So the bit position in 'matches' corresponds to the element offset.
333+
if (matches == 0)
334+
{
335+
// Zero flags set so no matches
336+
offset += Vector256<ushort>.Count;
337+
lengthToExamine -= Vector256<ushort>.Count;
338+
continue;
339+
}
340+
341+
// Find bitflag offset of first match and add to current offset,
342+
// flags are in bytes so divide for chars
343+
return (int)(offset + (BitOperations.TrailingZeroCount(matches) / sizeof(char)));
344+
} while (lengthToExamine > 0);
345+
}
260346

261-
pCh++;
347+
lengthToExamine = GetCharVector128SpanLength(offset, length);
348+
if (lengthToExamine > 0)
349+
{
350+
Debug.Assert(lengthToExamine >= Vector128<ushort>.Count);
351+
352+
Vector128<ushort> values = Vector128.Create((ushort)value);
353+
Vector128<ushort> search = LoadVector128(ref searchSpace, offset);
354+
355+
// Same method as above
356+
int matches = Sse2.MoveMask(Sse2.CompareEqual(values, search).AsByte());
357+
if (matches == 0)
358+
{
359+
// Zero flags set so no matches
360+
offset += Vector128<ushort>.Count;
361+
// Don't need to change lengthToExamine here as we don't use its current value again.
362+
}
363+
else
364+
{
365+
// Find bitflag offset of first match and add to current offset,
366+
// flags are in bytes so divide for chars
367+
return (int)(offset + (BitOperations.TrailingZeroCount(matches) / sizeof(char)));
368+
}
369+
}
370+
371+
if (offset < length)
372+
{
373+
lengthToExamine = length - offset;
374+
goto SequentialScan;
375+
}
262376
}
377+
}
378+
else if (Sse2.IsSupported)
379+
{
380+
if (offset < length)
381+
{
382+
Debug.Assert(length - offset >= Vector128<ushort>.Count);
263383

264-
// We get past SequentialScan only if IsHardwareAccelerated is true. However, we still have the redundant check to allow
265-
// the JIT to see that the code is unreachable and eliminate it when the platform does not have hardware accelerated.
266-
if (Vector.IsHardwareAccelerated && pCh < pEndCh)
384+
lengthToExamine = GetCharVector128SpanLength(offset, length);
385+
if (lengthToExamine > 0)
386+
{
387+
Vector128<ushort> values = Vector128.Create((ushort)value);
388+
do
389+
{
390+
Debug.Assert(lengthToExamine >= Vector128<ushort>.Count);
391+
392+
Vector128<ushort> search = LoadVector128(ref searchSpace, offset);
393+
394+
// Same method as above
395+
int matches = Sse2.MoveMask(Sse2.CompareEqual(values, search).AsByte());
396+
if (matches == 0)
397+
{
398+
// Zero flags set so no matches
399+
offset += Vector128<ushort>.Count;
400+
lengthToExamine -= Vector128<ushort>.Count;
401+
continue;
402+
}
403+
404+
// Find bitflag offset of first match and add to current offset,
405+
// flags are in bytes so divide for chars
406+
return (int)(offset + (BitOperations.TrailingZeroCount(matches) / sizeof(char)));
407+
} while (lengthToExamine > 0);
408+
}
409+
410+
if (offset < length)
411+
{
412+
lengthToExamine = length - offset;
413+
goto SequentialScan;
414+
}
415+
}
416+
}
417+
else if (Vector.IsHardwareAccelerated)
418+
{
419+
if (offset < length)
267420
{
268-
// Get the highest multiple of Vector<ushort>.Count that is within the search space.
269-
// That will be how many times we iterate in the loop below.
270-
// This is equivalent to: length = Vector<ushort>.Count * ((int)(pEndCh - pCh) / Vector<ushort>.Count)
271-
length = (int)((pEndCh - pCh) & ~(Vector<ushort>.Count - 1));
421+
Debug.Assert(length - offset >= Vector<ushort>.Count);
272422

273-
// Get comparison Vector
274-
Vector<ushort> vComparison = new Vector<ushort>(value);
423+
lengthToExamine = GetCharVectorSpanLength(offset, length);
275424

276-
while (length > 0)
425+
if (lengthToExamine > 0)
277426
{
278-
// Using Unsafe.Read instead of ReadUnaligned since the search space is pinned and pCh is always vector aligned
279-
Debug.Assert(((int)pCh & (Unsafe.SizeOf<Vector<ushort>>() - 1)) == 0);
280-
Vector<ushort> vMatches = Vector.Equals(vComparison, Unsafe.Read<Vector<ushort>>(pCh));
281-
if (Vector<ushort>.Zero.Equals(vMatches))
427+
Vector<ushort> values = new Vector<ushort>((ushort)value);
428+
do
282429
{
283-
pCh += Vector<ushort>.Count;
284-
length -= Vector<ushort>.Count;
285-
continue;
286-
}
287-
// Find offset of first match
288-
return (int)(pCh - pChars) + LocateFirstFoundChar(vMatches);
430+
Debug.Assert(lengthToExamine >= Vector<ushort>.Count);
431+
432+
var matches = Vector.Equals(values, LoadVector(ref searchSpace, offset));
433+
if (Vector<ushort>.Zero.Equals(matches))
434+
{
435+
offset += Vector<ushort>.Count;
436+
lengthToExamine -= Vector<ushort>.Count;
437+
continue;
438+
}
439+
440+
// Find offset of first match
441+
return (int)(offset + LocateFirstFoundChar(matches));
442+
} while (lengthToExamine > 0);
289443
}
290444

291-
if (pCh < pEndCh)
445+
if (offset < length)
292446
{
293-
length = (int)(pEndCh - pCh);
447+
lengthToExamine = length - offset;
294448
goto SequentialScan;
295449
}
296450
}
297-
298-
return -1;
299-
Found3:
300-
pCh++;
301-
Found2:
302-
pCh++;
303-
Found1:
304-
pCh++;
305-
Found:
306-
return (int)(pCh - pChars);
307451
}
452+
return -1;
453+
Found3:
454+
return (int)(offset + 3);
455+
Found2:
456+
return (int)(offset + 2);
457+
Found1:
458+
return (int)(offset + 1);
459+
Found:
460+
return (int)(offset);
308461
}
309462

310463
[MethodImpl(MethodImplOptions.AggressiveOptimization)]
@@ -876,5 +1029,62 @@ private static int LocateLastFoundChar(ulong match)
8761029
{
8771030
return 3 - (BitOperations.LeadingZeroCount(match) >> 4);
8781031
}
1032+
1033+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
1034+
public static ref char Add(ref char source, nint elementOffset)
1035+
=> ref Unsafe.Add(ref source, (IntPtr)elementOffset);
1036+
1037+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
1038+
private static unsafe Vector<ushort> LoadVector(ref char start, nint offset)
1039+
=> Unsafe.ReadUnaligned<Vector<ushort>>(ref Unsafe.As<char, byte>(ref Unsafe.Add(ref start, (IntPtr)offset)));
1040+
1041+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
1042+
private static unsafe Vector128<ushort> LoadVector128(ref char start, nint offset)
1043+
=> Unsafe.ReadUnaligned<Vector128<ushort>>(ref Unsafe.As<char, byte>(ref Unsafe.Add(ref start, (IntPtr)offset)));
1044+
1045+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
1046+
private static unsafe Vector256<ushort> LoadVector256(ref char start, nint offset)
1047+
=> Unsafe.ReadUnaligned<Vector256<ushort>>(ref Unsafe.As<char, byte>(ref Unsafe.Add(ref start, (IntPtr)offset)));
1048+
1049+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
1050+
private static unsafe UIntPtr LoadUIntPtr(ref char start, nint offset)
1051+
=> Unsafe.ReadUnaligned<UIntPtr>(ref Unsafe.As<char, byte>(ref Unsafe.Add(ref start, (IntPtr)offset)));
1052+
1053+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
1054+
private static unsafe nint GetCharVectorSpanLength(nint offset, nint length)
1055+
=> ((length - offset) & ~(Vector<ushort>.Count - 1));
1056+
1057+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
1058+
private static unsafe nint GetCharVector128SpanLength(nint offset, nint length)
1059+
=> ((length - offset) & ~(Vector128<ushort>.Count - 1));
1060+
1061+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
1062+
private static nint GetCharVector256SpanLength(nint offset, nint length)
1063+
=> ((length - offset) & ~(Vector256<ushort>.Count - 1));
1064+
1065+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
1066+
private static unsafe nint UnalignedCountVector(ref char searchSpace)
1067+
{
1068+
const int ElementsPerByte = sizeof(ushort) / sizeof(byte);
1069+
// Figure out how many characters to read sequentially until we are vector aligned
1070+
// This is equivalent to:
1071+
// unaligned = ((int)pCh % Unsafe.SizeOf<Vector<ushort>>()) / ElementsPerByte
1072+
// length = (Vector<ushort>.Count - unaligned) % Vector<ushort>.Count
1073+
1074+
// This alignment is only valid if the GC does not relocate; so we use ReadUnaligned to get the data.
1075+
// If a GC does occur and alignment is lost, the GC cost will outweigh any gains from alignment so it
1076+
// isn't too important to pin to maintain the alignment.
1077+
return (nint)(uint)(-(int)Unsafe.AsPointer(ref searchSpace) / ElementsPerByte ) & (Vector<ushort>.Count - 1);
1078+
}
1079+
1080+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
1081+
private static unsafe nint UnalignedCountVector128(ref char searchSpace)
1082+
{
1083+
const int ElementsPerByte = sizeof(ushort) / sizeof(byte);
1084+
// This alignment is only valid if the GC does not relocate; so we use ReadUnaligned to get the data.
1085+
// If a GC does occur and alignment is lost, the GC cost will outweigh any gains from alignment so it
1086+
// isn't too important to pin to maintain the alignment.
1087+
return (nint)(uint)(-(int)Unsafe.AsPointer(ref searchSpace) / ElementsPerByte ) & (Vector128<ushort>.Count - 1);
1088+
}
8791089
}
8801090
}

0 commit comments

Comments
 (0)