Skip to content
This repository has been archived by the owner on May 10, 2018. It is now read-only.

Superseek #120

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 70 additions & 51 deletions src/Channels/ReadableBuffer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
using System;
using System.Buffers;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Text;

namespace Channels
Expand All @@ -13,7 +14,16 @@ namespace Channels
/// </summary>
public struct ReadableBuffer
{
private static readonly int VectorWidth = Vector<byte>.Count;
// Multiplier used by LocateFirstFoundByte(long). The OR-ed terms form 0x0001020304050607 and the
// trailing "+ 1" makes the constant 0x0001020304050608. LocateFirstFoundByte multiplies the mask
// (value ^ (value - 1)) by this constant and shifts the product right by 57, so the answer is
// read out of the top bits of the product.
private const ulong xorPowerOfTwoToHighByte = (0x07ul |
0x06ul << 8 |
0x05ul << 16 |
0x04ul << 24 |
0x03ul << 32 |
0x02ul << 40 |
0x01ul << 48 ) + 1;

// 0x0101010101010101: every byte is 0x01, so multiplying it by a byte value places that value in
// each of the eight byte positions (used by SetLowBitsForByteMatch to build the comparison word).
private const ulong byteBroadcastToUlong = ~0UL / byte.MaxValue;
// 0x8080808080808080: only the high bit of each byte is set. Built by rotating the broadcast
// constant right by one bit (shift right by 1, OR-ed with the low bit moved up to bit 63).
private const ulong filterByteHighBitsInUlong = (byteBroadcastToUlong >> 1) | (byteBroadcastToUlong << (sizeof(ulong) * 8 - 1));

private Memory<byte> _first;

Expand Down Expand Up @@ -154,8 +164,6 @@ public bool TrySliceTo(byte b1, out ReadableBuffer slice, out ReadCursor cursor)
return false;
}

var byte0Vector = CommonVectors.GetVector(b1);

var seek = 0;

foreach (var span in this)
Expand All @@ -165,19 +173,43 @@ public bool TrySliceTo(byte b1, out ReadableBuffer slice, out ReadCursor cursor)

if (Vector.IsHardwareAccelerated)
{
while (currentSpan.Length >= VectorWidth)
// Search by Vector length (16/32/64 bytes)
while (currentSpan.Length >= Vector<byte>.Count)
{
var data = currentSpan.Read<Vector<byte>>();
var byte0Equals = Vector.Equals(data, byte0Vector);

var byte0Equals = Vector.Equals(data, CommonVectors.GetVector(b1));
if (byte0Equals.Equals(Vector<byte>.Zero))
{
currentSpan = currentSpan.Slice(VectorWidth);
seek += VectorWidth;
currentSpan = currentSpan.Slice(Vector<byte>.Count);
seek += Vector<byte>.Count;
}
else
{
var index = LocateFirstFoundByte(ref byte0Equals);
seek += index;
found = true;
break;
}
}
}

if (!found)
{
// Search by Long length (8 bytes)
while (currentSpan.Length >= sizeof(ulong))
{
var data = currentSpan.Read<ulong>();

var byteEquals = SetLowBitsForByteMatch(data, b1);
if (byteEquals == 0)
{
currentSpan = currentSpan.Slice(sizeof(ulong));
seek += sizeof(ulong);
}
else
{
var index = FindFirstEqualByte(ref byte0Equals);
var index = LocateFirstFoundByte(byteEquals);
seek += index;
found = true;
break;
Expand All @@ -187,7 +219,7 @@ public bool TrySliceTo(byte b1, out ReadableBuffer slice, out ReadCursor cursor)

if (!found)
{
// Slow search
// Byte by byte search
for (int i = 0; i < currentSpan.Length; i++)
{
if (currentSpan[i] == b1)
Expand Down Expand Up @@ -412,60 +444,47 @@ internal void ClearCursors()
}

/// <summary>
/// Find first byte
/// Locate the first of the found bytes
/// </summary>
/// <param name="byteEquals"></param >
/// <returns>The first index of the result vector</returns>
/// <exception cref="InvalidOperationException">byteEquals = 0</exception>
internal static int FindFirstEqualByte(ref Vector<byte> byteEquals)
// Force inlining (64 IL bytes, 91 bytes asm) Issue: https://github.com/dotnet/coreclr/issues/7386
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static int LocateFirstFoundByte(ref Vector<byte> byteEquals)
{
if (!BitConverter.IsLittleEndian) return FindFirstEqualByteSlow(ref byteEquals);

// Quasi-tree search
var vector64 = Vector.AsVectorInt64(byteEquals);
for (var i = 0; i < Vector<long>.Count; i++)
long longValue = 0;
var i = 0;
for (; i < Vector<long>.Count; i++)
{
var longValue = vector64[i];
longValue = vector64[i];
if (longValue == 0) continue;

return (i << 3) +
((longValue & 0x00000000ffffffff) > 0
? (longValue & 0x000000000000ffff) > 0
? (longValue & 0x00000000000000ff) > 0 ? 0 : 1
: (longValue & 0x0000000000ff0000) > 0 ? 2 : 3
: (longValue & 0x0000ffff00000000) > 0
? (longValue & 0x000000ff00000000) > 0 ? 4 : 5
: (longValue & 0x00ff000000000000) > 0 ? 6 : 7);
break;
}
throw new InvalidOperationException();

// Single LEA instruction with jitted const (using function result)
return i * 8 + LocateFirstFoundByte(longValue);
}

// Internal for testing
internal static int FindFirstEqualByteSlow(ref Vector<byte> byteEquals)
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static int LocateFirstFoundByte(long byteEquals)
{
// Quasi-tree search
var vector64 = Vector.AsVectorInt64(byteEquals);
for (var i = 0; i < Vector<long>.Count; i++)
{
var longValue = vector64[i];
if (longValue == 0) continue;
// Flag least significant power of two bit
var powerOfTwoFlag = (ulong)(byteEquals ^ (byteEquals - 1));
// Shift all powers of two into the high byte and extract
return (int)((powerOfTwoFlag * xorPowerOfTwoToHighByte) >> 57);
}

var shift = i << 1;
var offset = shift << 2;
var vector32 = Vector.AsVectorInt32(byteEquals);
if (vector32[shift] != 0)
{
if (byteEquals[offset] != 0) return offset;
if (byteEquals[offset + 1] != 0) return offset + 1;
if (byteEquals[offset + 2] != 0) return offset + 2;
return offset + 3;
}
if (byteEquals[offset + 4] != 0) return offset + 4;
if (byteEquals[offset + 5] != 0) return offset + 5;
if (byteEquals[offset + 6] != 0) return offset + 6;
return offset + 7;
}
throw new InvalidOperationException();
/// <summary>
/// Compares each of the eight bytes packed in <paramref name="ulongValue"/> with
/// <paramref name="search"/> and reports matches as the lowest bit of the corresponding byte.
/// </summary>
/// <param name="ulongValue">Eight bytes read as a single 64-bit word.</param>
/// <param name="search">The byte value to look for.</param>
/// <returns>
/// Zero when no byte equals <paramref name="search"/>; otherwise a word whose lowest set bit is
/// bit 0 of the first (least significant) matching byte. The subtraction can borrow into higher
/// bytes, so flags above the lowest one should not be relied on individually.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static long SetLowBitsForByteMatch(ulong ulongValue, byte search)
{
    // Repeat the search byte in every position; XOR then turns each matching byte into 0x00.
    ulong searchPattern = byteBroadcastToUlong * search;
    ulong difference = ulongValue ^ searchPattern;

    // Zero-byte test: subtracting 0x01 from a 0x00 byte sets its high bit, while "& ~difference"
    // discards bytes whose high bit was already set before the subtraction.
    ulong borrowed = difference - byteBroadcastToUlong;
    ulong highBitFlags = borrowed & ~difference & filterByteHighBitsInUlong;

    // Move each flag from bit 7 of its byte down to bit 0.
    return (long)(highBitFlags >> 7);
}

/// <summary>
Expand Down
37 changes: 37 additions & 0 deletions test/Channels.Tests.Performance/Configs.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
using BenchmarkDotNet.Configs;
using BenchmarkDotNet.Jobs;

namespace Channels.Tests.Performance
{
/// <summary>
/// Benchmark configuration with a single x64 / RyuJIT / CLR job and the Windows memory
/// diagnoser attached.
/// </summary>
public class DefaultConfig : ManualConfig
{
    public DefaultConfig()
    {
        // 3 launches, 5 warmup iterations and 10 measured iterations of 200ms each.
        var job = Job.Default
            .With(Platform.X64)
            .With(Jit.RyuJit)
            .With(Runtime.Clr)
            .WithLaunchCount(3)
            .WithIterationTime(200) // 200ms per iteration
            .WithWarmupCount(5)
            .WithTargetCount(10);
        Add(job);

        // Fully qualified so this file needs no using directive for the Windows diagnostics namespace.
        var memoryDiagnoser = new BenchmarkDotNet.Diagnostics.Windows.MemoryDiagnoser();
        Add(memoryDiagnoser);
    }
}

/// <summary>
/// Benchmark configuration with a single x64 / RyuJIT / CLR job and no diagnosers added.
/// </summary>
public class NoMemoryConfig : ManualConfig
{
    public NoMemoryConfig()
    {
        // 3 launches, 5 warmup iterations and 10 measured iterations of 200ms each.
        var job = Job.Default
            .With(Platform.X64)
            .With(Jit.RyuJit)
            .With(Runtime.Clr)
            .WithLaunchCount(3)
            .WithIterationTime(200) // 200ms per iteration
            .WithWarmupCount(5)
            .WithTargetCount(10);
        Add(job);
    }
}
}
24 changes: 6 additions & 18 deletions test/Channels.Tests.Performance/Program.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
using System;
using BenchmarkDotNet.Configs;
using BenchmarkDotNet.Diagnostics.Windows;
using BenchmarkDotNet.Jobs;
using BenchmarkDotNet.Running;

Expand Down Expand Up @@ -32,33 +31,22 @@ private static void RunSelectedBenchmarks(BenchmarkType type)
{
BenchmarkRunner.Run<ChannelsStreamsBenchmark>();
}

if (type.HasFlag(BenchmarkType.TrySliceTo))
{
BenchmarkRunner.Run<TrySliceToBenchmark>();
}
}
}

/// <summary>
/// Bit flags selecting which benchmark suites are run.
/// </summary>
[Flags]
public enum BenchmarkType : uint
{
    Streams = 1u << 0,
    TrySliceTo = 1u << 1,
    // New entries take the next unused bit (1u << 2, 1u << 3, ...).

    All = uint.MaxValue
}

public class DefaultConfig : ManualConfig
{
public DefaultConfig()
{
Add(Job.Default.
With(Platform.X64).
With(Jit.RyuJit).
With(Runtime.Clr).
WithLaunchCount(3).
WithIterationTime(200). // 200ms per iteration
WithWarmupCount(5).
WithTargetCount(10));

Add(new MemoryDiagnoser());
}
}
}

Loading