Skip to content

Commit

Permalink
One more iteration
Browse files Browse the repository at this point in the history
  • Loading branch information
aalmada committed Apr 26, 2024
1 parent 2859001 commit d4c13b0
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 45 deletions.
36 changes: 21 additions & 15 deletions src/NetFabric.Numerics.Tensors/ApplyBinary.cs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
using System;

namespace NetFabric.Numerics.Tensors;

public static partial class Tensor
Expand Down Expand Up @@ -35,28 +37,32 @@ public static void Apply<T1, T2, TResult, TOperator>(ReadOnlyMemory<T1> x, ReadO
if (x.Length > destination.Length)
Throw.ArgumentException(nameof(destination), "Destination span is too small.");

if(x.Length > 2 * minChunkSize)
ParallelApply(x, y, destination);
var coreCount = AvailableCores();

if (coreCount >= minChunkCount && x.Length > minChunkCount * minChunkSize)
ParallelApply(x, y, destination, coreCount);
else
Apply<T1, T2, TResult, TOperator>(x.Span, y.Span, destination.Span);

static void ParallelApply(ReadOnlyMemory<T1> x, ReadOnlyMemory<T2> y, Memory<TResult> destination)
static void ParallelApply(ReadOnlyMemory<T1> x, ReadOnlyMemory<T2> y, Memory<TResult> destination, int coreCount)
{
var size = x.Length;
var chunkSize = int.Max(size / AvailableCores(), minChunkSize);
var totalSize = x.Length;
var chunkSize = int.Max(totalSize / coreCount, minChunkSize);

var actions = new Action[size / chunkSize];
var actions = new Action[totalSize / chunkSize];
var start = 0;
for (var index = 0; index < actions.Length; index++)
{
var start = index * chunkSize;
var length = (index == actions.Length - 1)
? size - start
var length = (index == actions.Length - 1)
? totalSize - start
: chunkSize;

var xSlice = x.Slice(start, length);
var ySlice = y.Slice(start, length);
var destinationSlice = destination.Slice(start, length);
actions[index] = () => Apply<T1, T2, TResult, TOperator>(xSlice.Span, ySlice.Span, destinationSlice.Span);

start += length;
}
Parallel.Invoke(actions);
}
Expand Down Expand Up @@ -111,13 +117,13 @@ public static void Apply<T1, T2, TResult, TOperator>(ReadOnlySpan<T1> x, ReadOnl
var indexSource = 0;

// Check if hardware acceleration and Vector<T> support are available,
// and if the length of the x is greater than the Vector<T>.Count.
// and if the length of x is greater than the number of elements in a Vector<T>.
if (TOperator.IsVectorizable &&
Vector.IsHardwareAccelerated &&
Vector<T1>.IsSupported &&
Vector<T2>.IsSupported &&
Vector<TResult>.IsSupported &&
x.Length >= Vector<T1>.Count)
x.Length > Vector<T1>.Count)
{
// Cast the spans to vectors for hardware acceleration.
var xVectors = MemoryMarshal.Cast<T1, Vector<T1>>(x);
Expand Down Expand Up @@ -220,13 +226,13 @@ public static void Apply<T1, T2, TResult, TOperator>(ReadOnlySpan<T1> x, T2 y, S
var indexSource = 0;

// Check if hardware acceleration and Vector<T> support are available,
// and if the length of the x is greater than the Vector<T>.Count.
// and if the length of x is greater than the number of elements in a Vector<T>.
if (TOperator.IsVectorizable &&
Vector.IsHardwareAccelerated &&
Vector<T1>.IsSupported &&
Vector<T2>.IsSupported &&
Vector<TResult>.IsSupported &&
x.Length >= Vector<T1>.Count)
x.Length > Vector<T1>.Count)
{
// Cast the spans to vectors for hardware acceleration.
var xVectors = MemoryMarshal.Cast<T1, Vector<T1>>(x);
Expand Down Expand Up @@ -328,15 +334,15 @@ public static void Apply<T1, T2, TResult, TOperator>(ReadOnlySpan<T1> x, (T2, T2
var indexSource = 0;

// Check if hardware acceleration and Vector<T> support are available,
// and if the length of the x is greater than the Vector<T>.Count.
// and if the length of x is greater than the number of elements in a Vector<T>.
if (TOperator.IsVectorizable &&
Vector.IsHardwareAccelerated &&
Vector<T1>.IsSupported &&
Vector<T2>.IsSupported &&
Vector<TResult>.IsSupported &&
Vector<T1>.Count > 2 &&
Vector<T1>.Count % 2 is 0 &&
x.Length >= Vector<T1>.Count)
x.Length > Vector<T1>.Count)
{
// Cast the spans to vectors for hardware acceleration.
var xVectors = MemoryMarshal.Cast<T1, Vector<T1>>(x);
Expand Down
28 changes: 14 additions & 14 deletions src/NetFabric.Numerics.Tensors/ApplyTernary.cs
Original file line number Diff line number Diff line change
Expand Up @@ -57,14 +57,14 @@ public static void Apply<T1, T2, T3, TResult, TOperator>(ReadOnlySpan<T1> x, Rea
var indexSource = 0;

// Check if hardware acceleration and Vector<T> support are available,
// and if the length of the x is greater than the Vector<T>.Count.
// and if the length of x is greater than the number of elements in a Vector<T>.
if (TOperator.IsVectorizable &&
Vector.IsHardwareAccelerated &&
Vector<T1>.IsSupported &&
Vector<T2>.IsSupported &&
Vector<T3>.IsSupported &&
Vector<TResult>.IsSupported &&
x.Length >= Vector<T1>.Count)
x.Length > Vector<T1>.Count)
{
// Cast the spans to vectors for hardware acceleration.
var xVectors = MemoryMarshal.Cast<T1, Vector<T1>>(x);
Expand Down Expand Up @@ -178,14 +178,14 @@ public static void Apply<T1, T2, T3, TResult, TOperator>(ReadOnlySpan<T1> x, T2
var indexSource = 0;

// Check if hardware acceleration and Vector<T> support are available,
// and if the length of the x is greater than the Vector<T>.Count.
// and if the length of x is greater than the number of elements in a Vector<T>.
if (TOperator.IsVectorizable &&
Vector.IsHardwareAccelerated &&
Vector<T1>.IsSupported &&
Vector<T2>.IsSupported &&
Vector<T3>.IsSupported &&
Vector<TResult>.IsSupported &&
x.Length >= Vector<T1>.Count)
x.Length > Vector<T1>.Count)
{
// Cast the spans to vectors for hardware acceleration.
var xVectors = MemoryMarshal.Cast<T1, Vector<T1>>(x);
Expand Down Expand Up @@ -299,7 +299,7 @@ public static void Apply<T1, T2, T3, TResult, TOperator>(ReadOnlySpan<T1> x, (T2
var indexSource = 0;

// Check if hardware acceleration and Vector<T> support are available,
// and if the length of the x is greater than the Vector<T>.Count.
// and if the length of x is greater than the number of elements in a Vector<T>.
if (TOperator.IsVectorizable &&
Vector.IsHardwareAccelerated &&
Vector<T1>.IsSupported &&
Expand All @@ -308,7 +308,7 @@ public static void Apply<T1, T2, T3, TResult, TOperator>(ReadOnlySpan<T1> x, (T2
Vector<TResult>.IsSupported &&
Vector<T1>.Count > 2 &&
Vector<T1>.Count % 2 is 0 &&
x.Length >= Vector<T1>.Count)
x.Length > Vector<T1>.Count)
{
// Cast the spans to vectors for hardware acceleration.
var xVectors = MemoryMarshal.Cast<T1, Vector<T1>>(x);
Expand Down Expand Up @@ -474,14 +474,14 @@ public static void Apply<T1, T2, T3, TResult, TOperator>(ReadOnlySpan<T1> x, Rea
var indexSource = 0;

// Check if hardware acceleration and Vector<T> support are available,
// and if the length of the x is greater than the Vector<T>.Count.
// and if the length of x is greater than the number of elements in a Vector<T>.
if (TOperator.IsVectorizable &&
Vector.IsHardwareAccelerated &&
Vector<T1>.IsSupported &&
Vector<T2>.IsSupported &&
Vector<T3>.IsSupported &&
Vector<TResult>.IsSupported &&
x.Length >= Vector<T1>.Count)
x.Length > Vector<T1>.Count)
{
// Cast the spans to vectors for hardware acceleration.
var xVectors = MemoryMarshal.Cast<T1, Vector<T1>>(x);
Expand Down Expand Up @@ -595,7 +595,7 @@ public static void Apply<T1, T2, T3, TResult, TOperator>(ReadOnlySpan<T1> x, Rea
var indexSource = 0;

// Check if hardware acceleration and Vector<T> support are available,
// and if the length of the x is greater than the Vector<T>.Count.
// and if the length of x is greater than the number of elements in a Vector<T>.
if (TOperator.IsVectorizable &&
Vector.IsHardwareAccelerated &&
Vector<T1>.IsSupported &&
Expand All @@ -604,7 +604,7 @@ public static void Apply<T1, T2, T3, TResult, TOperator>(ReadOnlySpan<T1> x, Rea
Vector<TResult>.IsSupported &&
Vector<T1>.Count > 2 &&
Vector<T1>.Count % 2 is 0 &&
x.Length >= Vector<T1>.Count)
x.Length > Vector<T1>.Count)
{
// Cast the spans to vectors for hardware acceleration.
var xVectors = MemoryMarshal.Cast<T1, Vector<T1>>(x);
Expand Down Expand Up @@ -776,14 +776,14 @@ public static void Apply<T1, T2, T3, TResult, TOperator>(ReadOnlySpan<T1> x, T2
var indexSource = 0;

// Check if hardware acceleration and Vector<T> support are available,
// and if the length of the x is greater than the Vector<T>.Count.
// and if the length of x is greater than the number of elements in a Vector<T>.
if (TOperator.IsVectorizable &&
Vector.IsHardwareAccelerated &&
Vector<T1>.IsSupported &&
Vector<T2>.IsSupported &&
Vector<T3>.IsSupported &&
Vector<TResult>.IsSupported &&
x.Length >= Vector<T1>.Count)
x.Length > Vector<T1>.Count)
{
// Cast the spans to vectors for hardware acceleration.
var xVectors = MemoryMarshal.Cast<T1, Vector<T1>>(x);
Expand Down Expand Up @@ -890,7 +890,7 @@ public static void Apply<T1, T2, T3, TResult, TOperator>(ReadOnlySpan<T1> x, (T2
var indexSource = 0;

// Check if hardware acceleration and Vector<T> support are available,
// and if the length of the x is greater than the Vector<T>.Count.
// and if the length of x is greater than the number of elements in a Vector<T>.
if (TOperator.IsVectorizable &&
Vector.IsHardwareAccelerated &&
Vector<T1>.IsSupported &&
Expand All @@ -899,7 +899,7 @@ public static void Apply<T1, T2, T3, TResult, TOperator>(ReadOnlySpan<T1> x, (T2
Vector<TResult>.IsSupported &&
Vector<T1>.Count > 2 &&
Vector<T1>.Count % 2 is 0 &&
x.Length >= Vector<T1>.Count)
x.Length > Vector<T1>.Count)
{
// Cast the spans to vectors for hardware acceleration.
var xVectors = MemoryMarshal.Cast<T1, Vector<T1>>(x);
Expand Down
34 changes: 19 additions & 15 deletions src/NetFabric.Numerics.Tensors/ApplyUnary.cs
Original file line number Diff line number Diff line change
Expand Up @@ -31,27 +31,31 @@ public static void Apply<T, TResult, TOperator>(ReadOnlyMemory<T> x, Memory<TRes
if (x.Length > destination.Length)
Throw.ArgumentException(nameof(destination), "Destination span is too small.");

if(x.Length > 2 * minChunkSize)
ParallelApply(x, destination);
var coreCount = AvailableCores();

if (coreCount >= minChunkCount && x.Length > minChunkCount * minChunkSize)
ParallelApply(x, destination, coreCount);
else
Apply<T, TResult, TOperator>(x.Span, destination.Span);

static void ParallelApply(ReadOnlyMemory<T> source, Memory<TResult> destination)
static void ParallelApply(ReadOnlyMemory<T> x, Memory<TResult> destination, int coreCount)
{
var size = source.Length;
var chunkSize = int.Max(size / AvailableCores(), minChunkSize);
var totalSize = x.Length;
var chunkSize = int.Max(totalSize / coreCount, minChunkSize);

var actions = new Action[size / chunkSize];
var actions = new Action[totalSize / chunkSize];
var start = 0;
for (var index = 0; index < actions.Length; index++)
{
var start = index * chunkSize;
var length = (index == actions.Length - 1)
? size - start
var length = (index == actions.Length - 1)
? totalSize - start
: chunkSize;

var sourceSlice = source.Slice(start, length);
var xSlice = x.Slice(start, length);
var destinationSlice = destination.Slice(start, length);
actions[index] = () => Apply<T, TResult, TOperator>(sourceSlice.Span, destinationSlice.Span);
actions[index] = () => Apply<T, TResult, TOperator>(xSlice.Span, destinationSlice.Span);

start += length;
}
Parallel.Invoke(actions);
}
Expand Down Expand Up @@ -98,12 +102,12 @@ public static void Apply<T, TResult, TOperator>(ReadOnlySpan<T> x, Span<TResult>
var indexSource = 0;

// Check if hardware acceleration and Vector<T> support are available,
// and if the length of the x is greater than the Vector<T>.Count.
// and if the length of x is greater than the number of elements in a Vector<T>.
if (TOperator.IsVectorizable &&
Vector.IsHardwareAccelerated &&
Vector<T>.IsSupported &&
Vector<TResult>.IsSupported &&
x.Length >= Vector<T>.Count)
x.Length > Vector<T>.Count)
{
// Cast the spans to vectors for hardware acceleration.
var sourceVectors = MemoryMarshal.Cast<T, Vector<T>>(x);
Expand Down Expand Up @@ -215,14 +219,14 @@ public static void Apply2<T, TResult1, TResult2, TOperator1, TOperator2>(ReadOnl
var indexSource = 0;

// Check if hardware acceleration and Vector<T> support are available,
// and if the length of the x is greater than the Vector<T>.Count.
// and if the length of x is greater than the number of elements in a Vector<T>.
if (TOperator1.IsVectorizable &&
TOperator2.IsVectorizable &&
Vector.IsHardwareAccelerated &&
Vector<T>.IsSupported &&
Vector<TResult1>.IsSupported &&
Vector<TResult2>.IsSupported &&
x.Length >= Vector<T>.Count)
x.Length > Vector<T>.Count)
{
// Cast the spans to vectors for hardware acceleration.
var sourceVectors = MemoryMarshal.Cast<T, Vector<T>>(x);
Expand Down
3 changes: 2 additions & 1 deletion src/NetFabric.Numerics.Tensors/Tensor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ namespace NetFabric.Numerics.Tensors;
/// </remarks>
public static partial class Tensor
{
const int minChunkSize = 100;
const int minChunkSize = 1_000;
const int minChunkCount = 4;

[MethodImpl(MethodImplOptions.AggressiveInlining)]
static int AvailableCores()
Expand Down

0 comments on commit d4c13b0

Please sign in to comment.