|
5 | 5 | using System.Diagnostics;
|
6 | 6 | using System.Numerics;
|
7 | 7 | using System.Runtime.CompilerServices;
|
| 8 | +using System.Runtime.Intrinsics; |
8 | 9 | using System.Runtime.Intrinsics.X86;
|
9 | 10 |
|
10 | 11 | using Internal.Runtime.CompilerServices;
|
11 | 12 |
|
12 | 13 | #if BIT64
|
13 | 14 | using nuint = System.UInt64;
|
| 15 | +using nint = System.Int64; |
14 | 16 | #else
|
15 | 17 | using nuint = System.UInt32;
|
| 18 | +using nint = System.Int32; |
16 | 19 | #endif
|
17 | 20 |
|
18 | 21 | namespace System
|
/// <summary>
/// Returns the index of the first occurrence of <paramref name="value"/> within the
/// <paramref name="length"/> chars starting at <paramref name="searchSpace"/>, or -1 if not found.
/// Uses AVX2/SSE2 intrinsics or <see cref="Vector{T}"/> acceleration when available, after an
/// initial sequential scan that aligns the offset to a vector boundary.
/// </summary>
/// <param name="searchSpace">Reference to the first char of the search space.</param>
/// <param name="value">The char to locate.</param>
/// <param name="length">Number of chars to examine; must be non-negative.</param>
public static unsafe int IndexOf(ref char searchSpace, char value, int length)
{
    Debug.Assert(length >= 0);

    // offset = chars already examined; lengthToExamine = chars to process in the current phase.
    nint offset = 0;
    nint lengthToExamine = length;

    if (((int)Unsafe.AsPointer(ref searchSpace) & 1) != 0)
    {
        // Input isn't char aligned, we won't be able to align it to a Vector
    }
    else if (Sse2.IsSupported)
    {
        // Avx2 branch also operates on Sse2 sizes, so check is combined.
        // Needs to be double length to allow us to align the data first.
        if (length >= Vector128<ushort>.Count * 2)
        {
            lengthToExamine = UnalignedCountVector128(ref searchSpace);
        }
    }
    else if (Vector.IsHardwareAccelerated)
    {
        // Needs to be double length to allow us to align the data first.
        if (length >= Vector<ushort>.Count * 2)
        {
            lengthToExamine = UnalignedCountVector(ref searchSpace);
        }
    }

SequentialScan:
    // In the non-vector case lengthToExamine is the total length.
    // In the vector case lengthToExamine first aligns to Vector,
    // then in a second pass after the Vector lengths is the
    // remaining data that is shorter than a Vector length.
    while (lengthToExamine >= 4)
    {
        ref char current = ref Add(ref searchSpace, offset);

        if (value == current)
            goto Found;
        if (value == Add(ref current, 1))
            goto Found1;
        if (value == Add(ref current, 2))
            goto Found2;
        if (value == Add(ref current, 3))
            goto Found3;

        offset += 4;
        lengthToExamine -= 4;
    }

    while (lengthToExamine > 0)
    {
        if (value == Add(ref searchSpace, offset))
            goto Found;

        offset += 1;
        lengthToExamine -= 1;
    }

    // We get past SequentialScan only if IsHardwareAccelerated or intrinsic .IsSupported is true. However, we still have the redundant check to allow
    // the JIT to see that the code is unreachable and eliminate it when the platform does not have hardware accelerated.
    if (Avx2.IsSupported)
    {
        if (offset < length)
        {
            Debug.Assert(length - offset >= Vector128<ushort>.Count);
            if (((nint)Unsafe.AsPointer(ref Unsafe.Add(ref searchSpace, (IntPtr)offset)) & (nint)(Vector256<byte>.Count - 1)) != 0)
            {
                // Not currently aligned to Vector256 (is aligned to Vector128); this can cause a problem for searches
                // with no upper bound e.g. String.wcslen. Start with a check on Vector128 to align to Vector256,
                // before moving to processing Vector256.

                // If the input searchSpan has been fixed or pinned, this ensures we do not fault across memory pages
                // while searching for an end of string. Specifically this assumes that the length is either correct
                // or that the data is pinned, otherwise it may cause an AccessViolation from crossing a page boundary into an
                // unowned page. If the search is unbounded (e.g. null terminator in wcslen) and the search value is not found,
                // again this will likely cause an AccessViolation. However, correctly bounded searches will return -1 rather
                // than ever causing an AV.

                // If the searchSpan has not been fixed or pinned the GC can relocate it during the execution of this
                // method, so the alignment only acts as best endeavour. The GC cost is likely to dominate over
                // the misalignment that may occur after; so we default to giving the GC a free hand to relocate and
                // it's up to the caller whether they are operating over fixed data.
                Vector128<ushort> values = Vector128.Create((ushort)value);
                Vector128<ushort> search = LoadVector128(ref searchSpace, offset);

                // Same method as below
                int matches = Sse2.MoveMask(Sse2.CompareEqual(values, search).AsByte());
                if (matches == 0)
                {
                    // Zero flags set so no matches
                    offset += Vector128<ushort>.Count;
                }
                else
                {
                    // Find bitflag offset of first match and add to current offset
                    return (int)(offset + (BitOperations.TrailingZeroCount(matches) / sizeof(char)));
                }
            }

            lengthToExamine = GetCharVector256SpanLength(offset, length);
            if (lengthToExamine > 0)
            {
                Vector256<ushort> values = Vector256.Create((ushort)value);
                do
                {
                    Debug.Assert(lengthToExamine >= Vector256<ushort>.Count);

                    Vector256<ushort> search = LoadVector256(ref searchSpace, offset);
                    int matches = Avx2.MoveMask(Avx2.CompareEqual(values, search).AsByte());
                    // Note that MoveMask has converted the equal vector elements into a set of bit flags,
                    // So the bit position in 'matches' corresponds to the element offset.
                    if (matches == 0)
                    {
                        // Zero flags set so no matches
                        offset += Vector256<ushort>.Count;
                        lengthToExamine -= Vector256<ushort>.Count;
                        continue;
                    }

                    // Find bitflag offset of first match and add to current offset,
                    // flags are in bytes so divide for chars
                    return (int)(offset + (BitOperations.TrailingZeroCount(matches) / sizeof(char)));
                } while (lengthToExamine > 0);
            }

            lengthToExamine = GetCharVector128SpanLength(offset, length);
            if (lengthToExamine > 0)
            {
                Debug.Assert(lengthToExamine >= Vector128<ushort>.Count);

                Vector128<ushort> values = Vector128.Create((ushort)value);
                Vector128<ushort> search = LoadVector128(ref searchSpace, offset);

                // Same method as above
                int matches = Sse2.MoveMask(Sse2.CompareEqual(values, search).AsByte());
                if (matches == 0)
                {
                    // Zero flags set so no matches
                    offset += Vector128<ushort>.Count;
                    // Don't need to change lengthToExamine here as we don't use its current value again.
                }
                else
                {
                    // Find bitflag offset of first match and add to current offset,
                    // flags are in bytes so divide for chars
                    return (int)(offset + (BitOperations.TrailingZeroCount(matches) / sizeof(char)));
                }
            }

            if (offset < length)
            {
                // Fewer than a Vector128's worth of chars remain; finish sequentially.
                lengthToExamine = length - offset;
                goto SequentialScan;
            }
        }
    }
    else if (Sse2.IsSupported)
    {
        if (offset < length)
        {
            Debug.Assert(length - offset >= Vector128<ushort>.Count);

            lengthToExamine = GetCharVector128SpanLength(offset, length);
            if (lengthToExamine > 0)
            {
                Vector128<ushort> values = Vector128.Create((ushort)value);
                do
                {
                    Debug.Assert(lengthToExamine >= Vector128<ushort>.Count);

                    Vector128<ushort> search = LoadVector128(ref searchSpace, offset);

                    // Same method as above
                    int matches = Sse2.MoveMask(Sse2.CompareEqual(values, search).AsByte());
                    if (matches == 0)
                    {
                        // Zero flags set so no matches
                        offset += Vector128<ushort>.Count;
                        lengthToExamine -= Vector128<ushort>.Count;
                        continue;
                    }

                    // Find bitflag offset of first match and add to current offset,
                    // flags are in bytes so divide for chars
                    return (int)(offset + (BitOperations.TrailingZeroCount(matches) / sizeof(char)));
                } while (lengthToExamine > 0);
            }

            if (offset < length)
            {
                // Fewer than a Vector128's worth of chars remain; finish sequentially.
                lengthToExamine = length - offset;
                goto SequentialScan;
            }
        }
    }
    else if (Vector.IsHardwareAccelerated)
    {
        if (offset < length)
        {
            Debug.Assert(length - offset >= Vector<ushort>.Count);

            lengthToExamine = GetCharVectorSpanLength(offset, length);

            if (lengthToExamine > 0)
            {
                Vector<ushort> values = new Vector<ushort>((ushort)value);
                do
                {
                    Debug.Assert(lengthToExamine >= Vector<ushort>.Count);

                    var matches = Vector.Equals(values, LoadVector(ref searchSpace, offset));
                    if (Vector<ushort>.Zero.Equals(matches))
                    {
                        offset += Vector<ushort>.Count;
                        lengthToExamine -= Vector<ushort>.Count;
                        continue;
                    }

                    // Find offset of first match
                    return (int)(offset + LocateFirstFoundChar(matches));
                } while (lengthToExamine > 0);
            }

            if (offset < length)
            {
                // Fewer than a Vector's worth of chars remain; finish sequentially.
                lengthToExamine = length - offset;
                goto SequentialScan;
            }
        }
    }
    return -1;
Found3:
    return (int)(offset + 3);
Found2:
    return (int)(offset + 2);
Found1:
    return (int)(offset + 1);
Found:
    return (int)(offset);
}
|
309 | 462 |
|
310 | 463 | [MethodImpl(MethodImplOptions.AggressiveOptimization)]
|
@@ -876,5 +1029,62 @@ private static int LocateLastFoundChar(ulong match)
|
876 | 1029 | {
|
877 | 1030 | return 3 - (BitOperations.LeadingZeroCount(match) >> 4);
|
878 | 1031 | }
|
| 1032 | + |
// Returns a (writable) reference to the char located elementOffset chars past source.
// Thin wrapper over Unsafe.Add that accepts a native-int element offset.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static ref char Add(ref char source, nint elementOffset)
{
    return ref Unsafe.Add(ref source, (IntPtr)elementOffset);
}
| 1036 | + |
// Reads one Vector<ushort> worth of chars beginning 'offset' chars past 'start'.
// Uses an unaligned read because callers do not guarantee vector alignment.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe Vector<ushort> LoadVector(ref char start, nint offset)
{
    ref char element = ref Unsafe.Add(ref start, (IntPtr)offset);
    return Unsafe.ReadUnaligned<Vector<ushort>>(ref Unsafe.As<char, byte>(ref element));
}
| 1040 | + |
// Reads one Vector128<ushort> (8 chars) beginning 'offset' chars past 'start'.
// Uses an unaligned read because callers do not guarantee vector alignment.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe Vector128<ushort> LoadVector128(ref char start, nint offset)
{
    ref char element = ref Unsafe.Add(ref start, (IntPtr)offset);
    return Unsafe.ReadUnaligned<Vector128<ushort>>(ref Unsafe.As<char, byte>(ref element));
}
| 1044 | + |
// Reads one Vector256<ushort> (16 chars) beginning 'offset' chars past 'start'.
// Uses an unaligned read because callers do not guarantee vector alignment.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe Vector256<ushort> LoadVector256(ref char start, nint offset)
{
    ref char element = ref Unsafe.Add(ref start, (IntPtr)offset);
    return Unsafe.ReadUnaligned<Vector256<ushort>>(ref Unsafe.As<char, byte>(ref element));
}
| 1048 | + |
// Reads one pointer-sized chunk of chars beginning 'offset' chars past 'start'
// (4 chars on 64-bit, 2 on 32-bit). Unaligned read; alignment is not guaranteed.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe UIntPtr LoadUIntPtr(ref char start, nint offset)
{
    ref char element = ref Unsafe.Add(ref start, (IntPtr)offset);
    return Unsafe.ReadUnaligned<UIntPtr>(ref Unsafe.As<char, byte>(ref element));
}
| 1052 | + |
// Number of chars between offset and length, rounded down to a whole multiple of
// Vector<ushort>.Count (i.e. how many chars the Vector<T> loop may consume).
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe nint GetCharVectorSpanLength(nint offset, nint length)
{
    nint remaining = length - offset;
    // For a power-of-two count, x & ~(count - 1) == x - (x & (count - 1)).
    return remaining - (remaining & (Vector<ushort>.Count - 1));
}
| 1056 | + |
// Number of chars between offset and length, rounded down to a whole multiple of
// Vector128<ushort>.Count (8 chars per stride).
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe nint GetCharVector128SpanLength(nint offset, nint length)
{
    nint remaining = length - offset;
    // For a power-of-two count, x & ~(count - 1) == x - (x & (count - 1)).
    return remaining - (remaining & (Vector128<ushort>.Count - 1));
}
| 1060 | + |
// Number of chars between offset and length, rounded down to a whole multiple of
// Vector256<ushort>.Count (16 chars per stride).
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static nint GetCharVector256SpanLength(nint offset, nint length)
{
    nint remaining = length - offset;
    // For a power-of-two count, x & ~(count - 1) == x - (x & (count - 1)).
    return remaining - (remaining & (Vector256<ushort>.Count - 1));
}
| 1064 | + |
// Returns how many chars (in [0, Vector<ushort>.Count)) must be read sequentially from
// searchSpace before the current address is Vector<ushort>-aligned. Result depends on the
// runtime address of searchSpace, so it is only a snapshot if the GC can relocate the data.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe nint UnalignedCountVector(ref char searchSpace)
{
    const int ElementsPerByte = sizeof(ushort) / sizeof(byte);
    // Figure out how many characters to read sequentially until we are vector aligned
    // This is equivalent to:
    //   unaligned = ((int)pCh % Unsafe.SizeOf<Vector<ushort>>()) / ElementsPerByte
    //   length = (Vector<ushort>.Count - unaligned) % Vector<ushort>.Count
    // The negate/divide/mask form below computes the same value without a branch, relying on
    // two's-complement wraparound (hence the (uint) cast before masking).

    // This alignment is only valid if the GC does not relocate; so we use ReadUnaligned to get the data.
    // If a GC does occur and alignment is lost, the GC cost will outweigh any gains from alignment so it
    // isn't too important to pin to maintain the alignment.
    return (nint)(uint)(-(int)Unsafe.AsPointer(ref searchSpace) / ElementsPerByte) & (Vector<ushort>.Count - 1);
}
| 1079 | + |
// Returns how many chars (in [0, Vector128<ushort>.Count)) must be read sequentially from
// searchSpace before the current address is Vector128-aligned. Same negate/divide/mask trick
// as UnalignedCountVector; result depends on the runtime address of searchSpace.
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe nint UnalignedCountVector128(ref char searchSpace)
{
    const int ElementsPerByte = sizeof(ushort) / sizeof(byte);
    // This alignment is only valid if the GC does not relocate; so we use ReadUnaligned to get the data.
    // If a GC does occur and alignment is lost, the GC cost will outweigh any gains from alignment so it
    // isn't too important to pin to maintain the alignment.
    return (nint)(uint)(-(int)Unsafe.AsPointer(ref searchSpace) / ElementsPerByte) & (Vector128<ushort>.Count - 1);
}
879 | 1089 | }
|
880 | 1090 | }
|
0 commit comments