diff --git a/src/ImageSharp/Common/Helpers/IComponentShuffle.cs b/src/ImageSharp/Common/Helpers/IComponentShuffle.cs
new file mode 100644
index 0000000000..e354a57b00
--- /dev/null
+++ b/src/ImageSharp/Common/Helpers/IComponentShuffle.cs
@@ -0,0 +1,165 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Buffers.Binary;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+namespace SixLabors.ImageSharp
+{
+ /// <summary>
+ /// Defines the contract for methods that allow the shuffling of pixel components.
+ /// Used for shuffling on platforms that do not support Hardware Intrinsics.
+ /// </summary>
+ internal interface IComponentShuffle
+ {
+ /// <summary>
+ /// Gets the shuffle control: a single byte packing four 2-bit component indices.
+ /// </summary>
+ byte Control { get; }
+
+ /// <summary>
+ /// Shuffle 8-bit integers within 128-bit lanes in <paramref name="source"/>
+ /// using the control and store the results in <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="source">The source span of bytes.</param>
+ /// <param name="dest">The destination span of bytes.</param>
+ void RunFallbackShuffle(ReadOnlySpan source, Span dest);
+ }
+
+ internal readonly struct DefaultShuffle4 : IComponentShuffle // Shuffle for an arbitrary control byte; the fallback copies one byte at a time.
+ {
+ public DefaultShuffle4(byte p3, byte p2, byte p1, byte p0) // Packs the four component indices into a control byte via MmShuffle.
+ : this(SimdUtils.Shuffle.MmShuffle(p3, p2, p1, p0))
+ {
+ }
+
+ public DefaultShuffle4(byte control) => this.Control = control; // Uses an already-packed control byte as-is.
+
+ public byte Control { get; }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public void RunFallbackShuffle(ReadOnlySpan source, Span dest) // Scalar path: for each group of four, dest[i + k] = source[i + pk].
+ {
+ ref byte sBase = ref MemoryMarshal.GetReference(source);
+ ref byte dBase = ref MemoryMarshal.GetReference(dest);
+ SimdUtils.Shuffle.InverseMmShuffle( // Unpack the control byte into the four per-group source offsets.
+ this.Control,
+ out int p3,
+ out int p2,
+ out int p1,
+ out int p0);
+
+ for (int i = 0; i < source.Length; i += 4) // Unsafe.Add is not bounds-checked: a length that is not a multiple of 4 would read/write past the spans.
+ {
+ Unsafe.Add(ref dBase, i) = Unsafe.Add(ref sBase, p0 + i);
+ Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + i);
+ Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + i);
+ Unsafe.Add(ref dBase, i + 3) = Unsafe.Add(ref sBase, p3 + i);
+ }
+ }
+ }
+
+ internal readonly struct WXYZShuffle4 : IComponentShuffle // Fixed shuffle whose fallback is a single 8-bit left rotation per 32-bit group.
+ {
+ public byte Control => SimdUtils.Shuffle.MmShuffle(2, 1, 0, 3); // p0..p3 = 3, 0, 1, 2: destination elements 0..3 take source elements 3, 0, 1, 2.
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ {
+ ReadOnlySpan s = MemoryMarshal.Cast(source); // Reinterpret the spans so each iteration below handles one uint.
+ Span d = MemoryMarshal.Cast(dest);
+ ref uint sBase = ref MemoryMarshal.GetReference(s);
+ ref uint dBase = ref MemoryMarshal.GetReference(d);
+
+ // The JIT can detect and optimize rotation idioms ROTL (Rotate Left)
+ // and ROTR (Rotate Right) emitting efficient CPU instructions:
+ // https://github.com/dotnet/coreclr/pull/1830
+ for (int i = 0; i < s.Length; i++)
+ {
+ uint packed = Unsafe.Add(ref sBase, i);
+
+ // packed = [W Z Y X] (most significant byte written first)
+ // ROTL(8, packed) = [Z Y X W]
+ Unsafe.Add(ref dBase, i) = (packed << 8) | (packed >> 24);
+ }
+ }
+ }
+
+ internal readonly struct WZYXShuffle4 : IComponentShuffle // Fixed shuffle whose fallback reverses the byte order of each 32-bit group.
+ {
+ public byte Control => SimdUtils.Shuffle.MmShuffle(0, 1, 2, 3); // p0..p3 = 3, 2, 1, 0: destination elements 0..3 take source elements 3, 2, 1, 0.
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ {
+ ReadOnlySpan s = MemoryMarshal.Cast(source); // Reinterpret the spans so each iteration below handles one uint.
+ Span d = MemoryMarshal.Cast(dest);
+ ref uint sBase = ref MemoryMarshal.GetReference(s);
+ ref uint dBase = ref MemoryMarshal.GetReference(d);
+
+ for (int i = 0; i < s.Length; i++)
+ {
+ uint packed = Unsafe.Add(ref sBase, i);
+
+ // packed = [W Z Y X] (most significant byte written first)
+ // REVERSE(packed) = [X Y Z W]
+ Unsafe.Add(ref dBase, i) = BinaryPrimitives.ReverseEndianness(packed);
+ }
+ }
+ }
+
+ internal readonly struct YZWXShuffle4 : IComponentShuffle // Fixed shuffle whose fallback is a single 8-bit right rotation per 32-bit group.
+ {
+ public byte Control => SimdUtils.Shuffle.MmShuffle(0, 3, 2, 1); // p0..p3 = 1, 2, 3, 0: destination elements 0..3 take source elements 1, 2, 3, 0.
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ {
+ ReadOnlySpan s = MemoryMarshal.Cast(source); // Reinterpret the spans so each iteration below handles one uint.
+ Span d = MemoryMarshal.Cast(dest);
+ ref uint sBase = ref MemoryMarshal.GetReference(s);
+ ref uint dBase = ref MemoryMarshal.GetReference(d);
+
+ for (int i = 0; i < s.Length; i++)
+ {
+ uint packed = Unsafe.Add(ref sBase, i);
+
+ // packed = [W Z Y X] (most significant byte written first)
+ // ROTR(8, packed) = [X W Z Y]
+ Unsafe.Add(ref dBase, i) = (packed >> 8) | (packed << 24);
+ }
+ }
+ }
+
+ internal readonly struct ZYXWShuffle4 : IComponentShuffle // Fixed shuffle whose fallback swaps bytes 0 and 2 of each 32-bit group and keeps bytes 1 and 3.
+ {
+ public byte Control => SimdUtils.Shuffle.MmShuffle(3, 0, 1, 2); // p0..p3 = 2, 1, 0, 3: destination elements 0..3 take source elements 2, 1, 0, 3.
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ {
+ ReadOnlySpan s = MemoryMarshal.Cast(source); // Reinterpret the spans so each iteration below handles one uint.
+ Span d = MemoryMarshal.Cast(dest);
+ ref uint sBase = ref MemoryMarshal.GetReference(s);
+ ref uint dBase = ref MemoryMarshal.GetReference(d);
+
+ for (int i = 0; i < s.Length; i++)
+ {
+ uint packed = Unsafe.Add(ref sBase, i);
+
+ // packed = [W Z Y X] (most significant byte written first)
+ // tmp1 = [W 0 Y 0] (bytes that stay in place)
+ // tmp2 = [0 Z 0 X] (bytes that trade places)
+ // tmp3=ROTL(16, tmp2) = [0 X 0 Z]
+ // tmp1 + tmp3 = [W X Y Z] (the two masks are disjoint, so + acts like |)
+ uint tmp1 = packed & 0xFF00FF00;
+ uint tmp2 = packed & 0x00FF00FF;
+ uint tmp3 = (tmp2 << 16) | (tmp2 >> 16);
+
+ Unsafe.Add(ref dBase, i) = tmp1 + tmp3;
+ }
+ }
+ }
+}
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
index 2d788992ee..782328eddf 100644
--- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
@@ -18,6 +18,234 @@ public static class HwIntrinsics
public static ReadOnlySpan PermuteMaskEvenOdd8x32 => new byte[] { 0, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0 };
+ /// <summary>
+ /// Shuffle single-precision (32-bit) floating-point elements in <paramref name="source"/>
+ /// using the control and store the results in <paramref name="dest"/>. Both spans are advanced past the elements that were processed.
+ /// </summary>
+ /// <param name="source">The source span of floats.</param>
+ /// <param name="dest">The destination span of floats.</param>
+ /// <param name="control">The byte control.</param>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static void Shuffle4ChannelReduce(
+ ref ReadOnlySpan source,
+ ref Span dest,
+ byte control)
+ {
+ if (Avx.IsSupported || Sse.IsSupported) // Without either instruction set nothing is processed and the spans are left untouched.
+ {
+ int remainder = Avx.IsSupported // Elements that do not fill a whole vector; presumably ModuloP2 is a power-of-two modulo - confirm in ImageMaths.
+ ? ImageMaths.ModuloP2(source.Length, Vector256.Count)
+ : ImageMaths.ModuloP2(source.Length, Vector128.Count);
+
+ int adjustedCount = source.Length - remainder; // Length covered by whole vectors.
+
+ if (adjustedCount > 0)
+ {
+ Shuffle4Channel(
+ source.Slice(0, adjustedCount),
+ dest.Slice(0, adjustedCount),
+ control);
+
+ source = source.Slice(adjustedCount); // Leave only the unprocessed tail for the caller.
+ dest = dest.Slice(adjustedCount);
+ }
+ }
+ }
+
+ /// <summary>
+ /// Shuffle 8-bit integers within 128-bit lanes in <paramref name="source"/>
+ /// using the control and store the results in <paramref name="dest"/>. Both spans are advanced past the elements that were processed.
+ /// </summary>
+ /// <param name="source">The source span of bytes.</param>
+ /// <param name="dest">The destination span of bytes.</param>
+ /// <param name="control">The byte control.</param>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static void Shuffle4ChannelReduce(
+ ref ReadOnlySpan source,
+ ref Span dest,
+ byte control)
+ {
+ if (Avx2.IsSupported || Ssse3.IsSupported) // Without either instruction set nothing is processed and the spans are left untouched.
+ {
+ int remainder = Avx2.IsSupported // Elements that do not fill a whole vector; presumably ModuloP2 is a power-of-two modulo - confirm in ImageMaths.
+ ? ImageMaths.ModuloP2(source.Length, Vector256.Count)
+ : ImageMaths.ModuloP2(source.Length, Vector128.Count);
+
+ int adjustedCount = source.Length - remainder; // Length covered by whole vectors.
+
+ if (adjustedCount > 0)
+ {
+ Shuffle4Channel(
+ source.Slice(0, adjustedCount),
+ dest.Slice(0, adjustedCount),
+ control);
+
+ source = source.Slice(adjustedCount); // Leave only the unprocessed tail for the caller.
+ dest = dest.Slice(adjustedCount);
+ }
+ }
+ }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ private static void Shuffle4Channel( // Vectorized shuffle using Avx.Permute or Sse.Shuffle; the visible caller passes spans trimmed to whole vectors.
+ ReadOnlySpan source,
+ Span dest,
+ byte control)
+ {
+ if (Avx.IsSupported)
+ {
+ ref Vector256 sourceBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+
+ ref Vector256 destBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
+
+ int n = dest.Length / Vector256.Count; // Number of whole vectors to process.
+ int m = ImageMaths.Modulo4(n); // Vectors left over after the 4x unrolled loop (presumably n % 4 - confirm in ImageMaths).
+ int u = n - m; // Vectors handled by the unrolled loop.
+
+ for (int i = 0; i < u; i += 4) // Four vectors per iteration.
+ {
+ ref Vector256 vd0 = ref Unsafe.Add(ref destBase, i);
+ ref Vector256 vs0 = ref Unsafe.Add(ref sourceBase, i);
+
+ vd0 = Avx.Permute(vs0, control);
+ Unsafe.Add(ref vd0, 1) = Avx.Permute(Unsafe.Add(ref vs0, 1), control);
+ Unsafe.Add(ref vd0, 2) = Avx.Permute(Unsafe.Add(ref vs0, 2), control);
+ Unsafe.Add(ref vd0, 3) = Avx.Permute(Unsafe.Add(ref vs0, 3), control);
+ }
+
+ if (m > 0)
+ {
+ for (int i = u; i < n; i++) // Remaining vectors, one at a time.
+ {
+ Unsafe.Add(ref destBase, i) = Avx.Permute(Unsafe.Add(ref sourceBase, i), control);
+ }
+ }
+ }
+ else
+ {
+ // Sse: same scheme with 128-bit vectors; Sse.Shuffle receives the same vector twice so the shuffle stays within it.
+ ref Vector128 sourceBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+
+ ref Vector128 destBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
+
+ int n = dest.Length / Vector128.Count; // Number of whole vectors to process.
+ int m = ImageMaths.Modulo4(n); // Vectors left over after the 4x unrolled loop (presumably n % 4 - confirm in ImageMaths).
+ int u = n - m; // Vectors handled by the unrolled loop.
+
+ for (int i = 0; i < u; i += 4) // Four vectors per iteration.
+ {
+ ref Vector128 vd0 = ref Unsafe.Add(ref destBase, i);
+ ref Vector128 vs0 = ref Unsafe.Add(ref sourceBase, i);
+
+ vd0 = Sse.Shuffle(vs0, vs0, control);
+
+ Vector128 vs1 = Unsafe.Add(ref vs0, 1);
+ Unsafe.Add(ref vd0, 1) = Sse.Shuffle(vs1, vs1, control);
+
+ Vector128 vs2 = Unsafe.Add(ref vs0, 2);
+ Unsafe.Add(ref vd0, 2) = Sse.Shuffle(vs2, vs2, control);
+
+ Vector128 vs3 = Unsafe.Add(ref vs0, 3);
+ Unsafe.Add(ref vd0, 3) = Sse.Shuffle(vs3, vs3, control);
+ }
+
+ if (m > 0)
+ {
+ for (int i = u; i < n; i++) // Remaining vectors, one at a time.
+ {
+ Vector128 vs = Unsafe.Add(ref sourceBase, i);
+ Unsafe.Add(ref destBase, i) = Sse.Shuffle(vs, vs, control);
+ }
+ }
+ }
+ }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ private static void Shuffle4Channel( // Vectorized shuffle using Avx2.Shuffle or Ssse3.Shuffle with a mask built from the control byte.
+ ReadOnlySpan source,
+ Span dest,
+ byte control)
+ {
+ if (Avx2.IsSupported)
+ {
+ // The shuffle mask is rebuilt on the stack on every call. This was chosen
+ // for convenience while we determine what shuffle controls to add to the library.
+ // We can add static ROS (ReadOnlySpan) instances if need be in the future.
+ Span bytes = stackalloc byte[Vector256.Count];
+ Shuffle.MmShuffleSpan(ref bytes, control); // Expand the control byte into one source index per byte of the vector.
+ Vector256 vcm = Unsafe.As>(ref MemoryMarshal.GetReference(bytes));
+
+ ref Vector256 sourceBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+
+ ref Vector256 destBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
+
+ int n = dest.Length / Vector256.Count; // Number of whole vectors to process.
+ int m = ImageMaths.Modulo4(n); // Vectors left over after the 4x unrolled loop (presumably n % 4 - confirm in ImageMaths).
+ int u = n - m; // Vectors handled by the unrolled loop.
+
+ for (int i = 0; i < u; i += 4) // Four vectors per iteration.
+ {
+ ref Vector256 vs0 = ref Unsafe.Add(ref sourceBase, i);
+ ref Vector256 vd0 = ref Unsafe.Add(ref destBase, i);
+
+ vd0 = Avx2.Shuffle(vs0, vcm);
+ Unsafe.Add(ref vd0, 1) = Avx2.Shuffle(Unsafe.Add(ref vs0, 1), vcm);
+ Unsafe.Add(ref vd0, 2) = Avx2.Shuffle(Unsafe.Add(ref vs0, 2), vcm);
+ Unsafe.Add(ref vd0, 3) = Avx2.Shuffle(Unsafe.Add(ref vs0, 3), vcm);
+ }
+
+ if (m > 0)
+ {
+ for (int i = u; i < n; i++) // Remaining vectors, one at a time.
+ {
+ Unsafe.Add(ref destBase, i) = Avx2.Shuffle(Unsafe.Add(ref sourceBase, i), vcm);
+ }
+ }
+ }
+ else
+ {
+ // Ssse3: same scheme with 128-bit vectors and a 128-bit mask.
+ Span bytes = stackalloc byte[Vector128.Count];
+ Shuffle.MmShuffleSpan(ref bytes, control); // Expand the control byte into one source index per byte of the vector.
+ Vector128 vcm = Unsafe.As>(ref MemoryMarshal.GetReference(bytes));
+
+ ref Vector128 sourceBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+
+ ref Vector128 destBase =
+ ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
+
+ int n = dest.Length / Vector128.Count; // Number of whole vectors to process.
+ int m = ImageMaths.Modulo4(n); // Vectors left over after the 4x unrolled loop (presumably n % 4 - confirm in ImageMaths).
+ int u = n - m; // Vectors handled by the unrolled loop.
+
+ for (int i = 0; i < u; i += 4) // Four vectors per iteration.
+ {
+ ref Vector128 vs0 = ref Unsafe.Add(ref sourceBase, i);
+ ref Vector128 vd0 = ref Unsafe.Add(ref destBase, i);
+
+ vd0 = Ssse3.Shuffle(vs0, vcm);
+ Unsafe.Add(ref vd0, 1) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 1), vcm);
+ Unsafe.Add(ref vd0, 2) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 2), vcm);
+ Unsafe.Add(ref vd0, 3) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 3), vcm);
+ }
+
+ if (m > 0)
+ {
+ for (int i = u; i < n; i++) // Remaining vectors, one at a time.
+ {
+ Unsafe.Add(ref destBase, i) = Ssse3.Shuffle(Unsafe.Add(ref sourceBase, i), vcm);
+ }
+ }
+ }
+ }
+
///
/// Performs a multiplication and an addition of the .
///
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs
new file mode 100644
index 0000000000..a4a40fb4fa
--- /dev/null
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs
@@ -0,0 +1,141 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+namespace SixLabors.ImageSharp
+{
+ internal static partial class SimdUtils
+ {
+ /// <summary>
+ /// Shuffle single-precision (32-bit) floating-point elements in <paramref name="source"/>
+ /// using the control and store the results in <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="source">The source span of floats.</param>
+ /// <param name="dest">The destination span of floats.</param>
+ /// <param name="control">The byte control.</param>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static void Shuffle4Channel(
+ ReadOnlySpan source,
+ Span dest,
+ byte control)
+ {
+ VerifyShuffleSpanInput(source, dest); // DEBUG-only validation; release builds do not check the lengths.
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+ HwIntrinsics.Shuffle4ChannelReduce(ref source, ref dest, control); // Processes whole vectors and advances both spans past them.
+#endif
+
+ // Deal with the remainder (everything, when no intrinsics path ran):
+ if (source.Length > 0)
+ {
+ ShuffleRemainder4Channel(source, dest, control);
+ }
+ }
+
+ /// <summary>
+ /// Shuffle 8-bit integers within 128-bit lanes in <paramref name="source"/>
+ /// using the control and store the results in <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="source">The source span of bytes.</param>
+ /// <param name="dest">The destination span of bytes.</param>
+ /// <param name="shuffle">The type of shuffle to perform.</param>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static void Shuffle4Channel(
+ ReadOnlySpan source,
+ Span dest,
+ TShuffle shuffle)
+ where TShuffle : struct, IComponentShuffle
+ {
+ VerifyShuffleSpanInput(source, dest); // DEBUG-only validation; release builds do not check the lengths.
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+ HwIntrinsics.Shuffle4ChannelReduce(ref source, ref dest, shuffle.Control); // Processes whole vectors and advances both spans past them.
+#endif
+
+ // Deal with the remainder (everything, when no intrinsics path ran):
+ if (source.Length > 0)
+ {
+ shuffle.RunFallbackShuffle(source, dest);
+ }
+ }
+
+ public static void ShuffleRemainder4Channel( // Scalar shuffle: for each group of four floats, dest[i + k] = source[i + pk].
+ ReadOnlySpan source,
+ Span dest,
+ byte control)
+ {
+ ref float sBase = ref MemoryMarshal.GetReference(source);
+ ref float dBase = ref MemoryMarshal.GetReference(dest);
+ Shuffle.InverseMmShuffle(control, out int p3, out int p2, out int p1, out int p0); // Unpack the control byte into the four per-group source offsets.
+
+ for (int i = 0; i < source.Length; i += 4) // Unsafe.Add is not bounds-checked: a length that is not a multiple of 4 would read/write past the spans.
+ {
+ Unsafe.Add(ref dBase, i) = Unsafe.Add(ref sBase, p0 + i);
+ Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + i);
+ Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + i);
+ Unsafe.Add(ref dBase, i + 3) = Unsafe.Add(ref sBase, p3 + i);
+ }
+ }
+
+ [Conditional("DEBUG")] // Calls to this method are compiled out unless DEBUG is defined.
+ private static void VerifyShuffleSpanInput(ReadOnlySpan source, Span dest) // Checks the two preconditions the shuffle loops rely on.
+ where T : struct
+ {
+ DebugGuard.IsTrue( // The shuffle loops index both spans with the same offsets.
+ source.Length == dest.Length,
+ nameof(source),
+ "Input spans must be of same length!");
+
+ DebugGuard.IsTrue( // Elements are processed in groups of four.
+ source.Length % 4 == 0,
+ nameof(source),
+ "Input spans must be divisible by 4!");
+ }
+
+ public static class Shuffle // Helpers for packing and unpacking the 8-bit shuffle control.
+ {
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static byte MmShuffle(byte p3, byte p2, byte p1, byte p0) // Packs four indices, two bits each, with p0 in the lowest bits; values above 3 are not validated.
+ => (byte)((p3 << 6) | (p2 << 4) | (p1 << 2) | p0);
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static void MmShuffleSpan(ref Span span, byte control) // Fills the span with per-byte source indices: each group of four gets p0..p3 plus the group's offset.
+ {
+ InverseMmShuffle(
+ control,
+ out int p3,
+ out int p2,
+ out int p1,
+ out int p0);
+
+ ref byte spanBase = ref MemoryMarshal.GetReference(span);
+
+ for (int i = 0; i < span.Length; i += 4) // Unsafe.Add is not bounds-checked: the span length must be a multiple of 4.
+ {
+ Unsafe.Add(ref spanBase, i) = (byte)(p0 + i);
+ Unsafe.Add(ref spanBase, i + 1) = (byte)(p1 + i);
+ Unsafe.Add(ref spanBase, i + 2) = (byte)(p2 + i);
+ Unsafe.Add(ref spanBase, i + 3) = (byte)(p3 + i);
+ }
+ }
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static void InverseMmShuffle( // Inverse of MmShuffle: extracts the four 2-bit indices from the control byte.
+ byte control,
+ out int p3,
+ out int p2,
+ out int p1,
+ out int p0)
+ {
+ p3 = control >> 6 & 0x3; // The shift is applied before the mask.
+ p2 = control >> 4 & 0x3;
+ p1 = control >> 2 & 0x3;
+ p0 = control >> 0 & 0x3;
+ }
+ }
+ }
+}
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Argb32.PixelOperations.Generated.cs b/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Argb32.PixelOperations.Generated.cs
index 0b1292b641..3f48d2acca 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Argb32.PixelOperations.Generated.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Argb32.PixelOperations.Generated.cs
@@ -53,66 +53,58 @@ public override void ToVector4(Configuration configuration, ReadOnlySpan
Vector4Converters.RgbaCompatible.ToVector4(configuration, this, sourcePixels, destVectors, modifiers.Remove(PixelConversionModifiers.Scale));
}
///
- public override void ToRgba32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels)
+ public override void ToRgba32(
+ Configuration configuration,
+ ReadOnlySpan sourcePixels,
+ Span destinationPixels)
{
Guard.NotNull(configuration, nameof(configuration));
Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels));
- ref uint sourceRef = ref Unsafe.As(ref MemoryMarshal.GetReference(sourcePixels));
- ref uint destRef = ref Unsafe.As(ref MemoryMarshal.GetReference(destinationPixels));
-
- for (int i = 0; i < sourcePixels.Length; i++)
- {
- uint sp = Unsafe.Add(ref sourceRef, i);
- Unsafe.Add(ref destRef, i) = PixelConverter.FromArgb32.ToRgba32(sp);
- }
+ ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); // Reinterpret the source pixels as a flat span for the span-based converter.
+ Span dest = MemoryMarshal.Cast(destinationPixels); // Same reinterpretation for the destination pixels.
+ PixelConverter.FromArgb32.ToRgba32(source, dest); // Whole-span conversion; replaces the removed per-pixel loop.
}
///
- public override void FromRgba32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels)
+ public override void FromRgba32(
+ Configuration configuration,
+ ReadOnlySpan sourcePixels,
+ Span destinationPixels)
{
Guard.NotNull(configuration, nameof(configuration));
Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels));
- ref uint sourceRef = ref Unsafe.As(ref MemoryMarshal.GetReference(sourcePixels));
- ref uint destRef = ref Unsafe.As(ref MemoryMarshal.GetReference(destinationPixels));
-
- for (int i = 0; i < sourcePixels.Length; i++)
- {
- uint sp = Unsafe.Add(ref sourceRef, i);
- Unsafe.Add(ref destRef, i) = PixelConverter.FromRgba32.ToArgb32(sp);
- }
+ ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); // Reinterpret the source pixels as a flat span for the span-based converter.
+ Span dest = MemoryMarshal.Cast(destinationPixels); // Same reinterpretation for the destination pixels.
+ PixelConverter.FromRgba32.ToArgb32(source, dest); // Whole-span conversion; replaces the removed per-pixel loop.
}
///
- public override void ToBgra32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels)
+ public override void ToBgra32(
+ Configuration configuration,
+ ReadOnlySpan sourcePixels,
+ Span destinationPixels)
{
Guard.NotNull(configuration, nameof(configuration));
Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels));
- ref uint sourceRef = ref Unsafe.As(ref MemoryMarshal.GetReference(sourcePixels));
- ref uint destRef = ref Unsafe.As(ref MemoryMarshal.GetReference(destinationPixels));
-
- for (int i = 0; i < sourcePixels.Length; i++)
- {
- uint sp = Unsafe.Add(ref sourceRef, i);
- Unsafe.Add(ref destRef, i) = PixelConverter.FromArgb32.ToBgra32(sp);
- }
+ ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); // Reinterpret the source pixels as a flat span for the span-based converter.
+ Span dest = MemoryMarshal.Cast(destinationPixels); // Same reinterpretation for the destination pixels.
+ PixelConverter.FromArgb32.ToBgra32(source, dest); // Whole-span conversion; replaces the removed per-pixel loop.
}
///
- public override void FromBgra32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels)
+ public override void FromBgra32(
+ Configuration configuration,
+ ReadOnlySpan sourcePixels,
+ Span destinationPixels)
{
Guard.NotNull(configuration, nameof(configuration));
Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels));
- ref uint sourceRef = ref Unsafe.As(ref MemoryMarshal.GetReference(sourcePixels));
- ref uint destRef = ref Unsafe.As(ref MemoryMarshal.GetReference(destinationPixels));
-
- for (int i = 0; i < sourcePixels.Length; i++)
- {
- uint sp = Unsafe.Add(ref sourceRef, i);
- Unsafe.Add(ref destRef, i) = PixelConverter.FromBgra32.ToArgb32(sp);
- }
+ ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); // Reinterpret the source pixels as a flat span for the span-based converter.
+ Span dest = MemoryMarshal.Cast(destinationPixels); // Same reinterpretation for the destination pixels.
+ PixelConverter.FromBgra32.ToArgb32(source, dest); // Whole-span conversion; replaces the removed per-pixel loop.
}
///
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Bgra32.PixelOperations.Generated.cs b/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Bgra32.PixelOperations.Generated.cs
index 5bdd10404d..8cf2d5850a 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Bgra32.PixelOperations.Generated.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Bgra32.PixelOperations.Generated.cs
@@ -53,66 +53,58 @@ public override void ToVector4(Configuration configuration, ReadOnlySpan
Vector4Converters.RgbaCompatible.ToVector4(configuration, this, sourcePixels, destVectors, modifiers.Remove(PixelConversionModifiers.Scale));
}
///
- public override void ToRgba32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels)
+ public override void ToRgba32(
+ Configuration configuration,
+ ReadOnlySpan sourcePixels,
+ Span destinationPixels)
{
Guard.NotNull(configuration, nameof(configuration));
Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels));
- ref uint sourceRef = ref Unsafe.As(ref MemoryMarshal.GetReference(sourcePixels));
- ref uint destRef = ref Unsafe.As(ref MemoryMarshal.GetReference(destinationPixels));
-
- for (int i = 0; i < sourcePixels.Length; i++)
- {
- uint sp = Unsafe.Add(ref sourceRef, i);
- Unsafe.Add(ref destRef, i) = PixelConverter.FromBgra32.ToRgba32(sp);
- }
+ ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); // Reinterpret the source pixels as a flat span for the span-based converter.
+ Span dest = MemoryMarshal.Cast(destinationPixels); // Same reinterpretation for the destination pixels.
+ PixelConverter.FromBgra32.ToRgba32(source, dest); // Whole-span conversion; replaces the removed per-pixel loop.
}
///
- public override void FromRgba32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels)
+ public override void FromRgba32(
+ Configuration configuration,
+ ReadOnlySpan sourcePixels,
+ Span destinationPixels)
{
Guard.NotNull(configuration, nameof(configuration));
Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels));
- ref uint sourceRef = ref Unsafe.As(ref MemoryMarshal.GetReference(sourcePixels));
- ref uint destRef = ref Unsafe.As(ref MemoryMarshal.GetReference(destinationPixels));
-
- for (int i = 0; i < sourcePixels.Length; i++)
- {
- uint sp = Unsafe.Add(ref sourceRef, i);
- Unsafe.Add(ref destRef, i) = PixelConverter.FromRgba32.ToBgra32(sp);
- }
+ ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); // Reinterpret the source pixels as a flat span for the span-based converter.
+ Span dest = MemoryMarshal.Cast(destinationPixels); // Same reinterpretation for the destination pixels.
+ PixelConverter.FromRgba32.ToBgra32(source, dest); // Whole-span conversion; replaces the removed per-pixel loop.
}
///
- public override void ToArgb32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels)
+ public override void ToArgb32(
+ Configuration configuration,
+ ReadOnlySpan sourcePixels,
+ Span destinationPixels)
{
Guard.NotNull(configuration, nameof(configuration));
Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels));
- ref uint sourceRef = ref Unsafe.As(ref MemoryMarshal.GetReference(sourcePixels));
- ref uint destRef = ref Unsafe.As(ref MemoryMarshal.GetReference(destinationPixels));
-
- for (int i = 0; i < sourcePixels.Length; i++)
- {
- uint sp = Unsafe.Add(ref sourceRef, i);
- Unsafe.Add(ref destRef, i) = PixelConverter.FromBgra32.ToArgb32(sp);
- }
+ ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); // Reinterpret the source pixels as a flat span for the span-based converter.
+ Span dest = MemoryMarshal.Cast(destinationPixels); // Same reinterpretation for the destination pixels.
+ PixelConverter.FromBgra32.ToArgb32(source, dest); // Whole-span conversion; replaces the removed per-pixel loop.
}
///
- public override void FromArgb32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels)
+ public override void FromArgb32(
+ Configuration configuration,
+ ReadOnlySpan sourcePixels,
+ Span destinationPixels)
{
Guard.NotNull(configuration, nameof(configuration));
Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels));
- ref uint sourceRef = ref Unsafe.As(ref MemoryMarshal.GetReference(sourcePixels));
- ref uint destRef = ref Unsafe.As(ref MemoryMarshal.GetReference(destinationPixels));
-
- for (int i = 0; i < sourcePixels.Length; i++)
- {
- uint sp = Unsafe.Add(ref sourceRef, i);
- Unsafe.Add(ref destRef, i) = PixelConverter.FromArgb32.ToBgra32(sp);
- }
+ ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); // Reinterpret the source pixels as a flat span for the span-based converter.
+ Span dest = MemoryMarshal.Cast(destinationPixels); // Same reinterpretation for the destination pixels.
+ PixelConverter.FromArgb32.ToBgra32(source, dest); // Whole-span conversion; replaces the removed per-pixel loop.
}
///
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Rgba32.PixelOperations.Generated.cs b/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Rgba32.PixelOperations.Generated.cs
index b05c62f1f7..9a36ec29a4 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Rgba32.PixelOperations.Generated.cs
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Rgba32.PixelOperations.Generated.cs
@@ -42,66 +42,58 @@ public override void ToRgba32(Configuration configuration, ReadOnlySpan
}
///
- public override void ToArgb32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels)
+ public override void ToArgb32(
+ Configuration configuration,
+ ReadOnlySpan sourcePixels,
+ Span destinationPixels)
{
Guard.NotNull(configuration, nameof(configuration));
Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels));
- ref uint sourceRef = ref Unsafe.As(ref MemoryMarshal.GetReference(sourcePixels));
- ref uint destRef = ref Unsafe.As(ref MemoryMarshal.GetReference(destinationPixels));
-
- for (int i = 0; i < sourcePixels.Length; i++)
- {
- uint sp = Unsafe.Add(ref sourceRef, i);
- Unsafe.Add(ref destRef, i) = PixelConverter.FromRgba32.ToArgb32(sp);
- }
+ ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); // Reinterpret the source pixels as a flat span for the span-based converter.
+ Span dest = MemoryMarshal.Cast(destinationPixels); // Same reinterpretation for the destination pixels.
+ PixelConverter.FromRgba32.ToArgb32(source, dest); // Whole-span conversion; replaces the removed per-pixel loop.
}
///
- public override void FromArgb32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels)
+ public override void FromArgb32(
+ Configuration configuration,
+ ReadOnlySpan sourcePixels,
+ Span destinationPixels)
{
Guard.NotNull(configuration, nameof(configuration));
Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels));
- ref uint sourceRef = ref Unsafe.As(ref MemoryMarshal.GetReference(sourcePixels));
- ref uint destRef = ref Unsafe.As(ref MemoryMarshal.GetReference(destinationPixels));
-
- for (int i = 0; i < sourcePixels.Length; i++)
- {
- uint sp = Unsafe.Add(ref sourceRef, i);
- Unsafe.Add(ref destRef, i) = PixelConverter.FromArgb32.ToRgba32(sp);
- }
+ ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); // Reinterpret the source pixels as a flat span for the span-based converter.
+ Span dest = MemoryMarshal.Cast(destinationPixels); // Same reinterpretation for the destination pixels.
+ PixelConverter.FromArgb32.ToRgba32(source, dest); // Whole-span conversion; replaces the removed per-pixel loop.
}
///
- public override void ToBgra32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels)
+ public override void ToBgra32(
+ Configuration configuration,
+ ReadOnlySpan sourcePixels,
+ Span destinationPixels)
{
Guard.NotNull(configuration, nameof(configuration));
Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels));
- ref uint sourceRef = ref Unsafe.As(ref MemoryMarshal.GetReference(sourcePixels));
- ref uint destRef = ref Unsafe.As(ref MemoryMarshal.GetReference(destinationPixels));
-
- for (int i = 0; i < sourcePixels.Length; i++)
- {
- uint sp = Unsafe.Add(ref sourceRef, i);
- Unsafe.Add(ref destRef, i) = PixelConverter.FromRgba32.ToBgra32(sp);
- }
+ ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); // Reinterpret the source pixels as a flat span for the span-based converter.
+ Span dest = MemoryMarshal.Cast(destinationPixels); // Same reinterpretation for the destination pixels.
+ PixelConverter.FromRgba32.ToBgra32(source, dest); // Whole-span conversion; replaces the removed per-pixel loop.
}
///
- public override void FromBgra32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels)
+ public override void FromBgra32(
+ Configuration configuration,
+ ReadOnlySpan sourcePixels,
+ Span destinationPixels)
{
Guard.NotNull(configuration, nameof(configuration));
Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels));
- ref uint sourceRef = ref Unsafe.As(ref MemoryMarshal.GetReference(sourcePixels));
- ref uint destRef = ref Unsafe.As(ref MemoryMarshal.GetReference(destinationPixels));
-
- for (int i = 0; i < sourcePixels.Length; i++)
- {
- uint sp = Unsafe.Add(ref sourceRef, i);
- Unsafe.Add(ref destRef, i) = PixelConverter.FromBgra32.ToRgba32(sp);
- }
+ ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); // Reinterpret the source pixels as a flat span for the span-based converter.
+ Span dest = MemoryMarshal.Cast(destinationPixels); // Same reinterpretation for the destination pixels.
+ PixelConverter.FromBgra32.ToRgba32(source, dest); // Whole-span conversion; replaces the removed per-pixel loop.
}
///
diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/Generated/_Common.ttinclude b/src/ImageSharp/PixelFormats/PixelImplementations/Generated/_Common.ttinclude
index 5d56731ba6..d8b5286cd7 100644
--- a/src/ImageSharp/PixelFormats/PixelImplementations/Generated/_Common.ttinclude
+++ b/src/ImageSharp/PixelFormats/PixelImplementations/Generated/_Common.ttinclude
@@ -88,35 +88,31 @@ using System.Runtime.InteropServices;
{
#>
///
- public override void To<#=otherPixelType#>(Configuration configuration, ReadOnlySpan<<#=thisPixelType#>> sourcePixels, Span<<#=otherPixelType#>> destinationPixels)
+ public override void To<#=otherPixelType#>(
+ Configuration configuration,
+ ReadOnlySpan<<#=thisPixelType#>> sourcePixels,
+ Span<<#=otherPixelType#>> destinationPixels)
{
Guard.NotNull(configuration, nameof(configuration));
Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels));
- ref uint sourceRef = ref Unsafe.As<<#=thisPixelType#>,uint>(ref MemoryMarshal.GetReference(sourcePixels));
- ref uint destRef = ref Unsafe.As<<#=otherPixelType#>, uint>(ref MemoryMarshal.GetReference(destinationPixels));
-
- for (int i = 0; i < sourcePixels.Length; i++)
- {
- uint sp = Unsafe.Add(ref sourceRef, i);
- Unsafe.Add(ref destRef, i) = PixelConverter.From<#=thisPixelType#>.To<#=otherPixelType#>(sp);
- }
+ ReadOnlySpan source = MemoryMarshal.Cast<<#=thisPixelType#>, byte>(sourcePixels); // Reinterpret the source pixels as bytes for the span-based converter.
+ Span dest = MemoryMarshal.Cast<<#=otherPixelType#>, byte>(destinationPixels); // Same reinterpretation for the destination pixels.
+ PixelConverter.From<#=thisPixelType#>.To<#=otherPixelType#>(source, dest); // NOTE(review): the guard above appears to allow a longer destination, while the shuffle's DEBUG check requires equal lengths - confirm.
}
///
- public override void From<#=otherPixelType#>(Configuration configuration, ReadOnlySpan<<#=otherPixelType#>> sourcePixels, Span<<#=thisPixelType#>> destinationPixels)
+ public override void From<#=otherPixelType#>(
+ Configuration configuration,
+ ReadOnlySpan<<#=otherPixelType#>> sourcePixels,
+ Span<<#=thisPixelType#>> destinationPixels)
{
Guard.NotNull(configuration, nameof(configuration));
Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels));
- ref uint sourceRef = ref Unsafe.As<<#=otherPixelType#>,uint>(ref MemoryMarshal.GetReference(sourcePixels));
- ref uint destRef = ref Unsafe.As<<#=thisPixelType#>, uint>(ref MemoryMarshal.GetReference(destinationPixels));
-
- for (int i = 0; i < sourcePixels.Length; i++)
- {
- uint sp = Unsafe.Add(ref sourceRef, i);
- Unsafe.Add(ref destRef, i) = PixelConverter.From<#=otherPixelType#>.To<#=thisPixelType#>(sp);
- }
+ ReadOnlySpan source = MemoryMarshal.Cast<<#=otherPixelType#>, byte>(sourcePixels); // Reinterpret the source pixels as bytes for the span-based converter.
+ Span dest = MemoryMarshal.Cast<<#=thisPixelType#>, byte>(destinationPixels); // Same reinterpretation for the destination pixels.
+ PixelConverter.From<#=otherPixelType#>.To<#=thisPixelType#>(source, dest); // NOTE(review): the guard above appears to allow a longer destination, while the shuffle's DEBUG check requires equal lengths - confirm.
}
<#+
}
diff --git a/src/ImageSharp/PixelFormats/Utils/PixelConverter.cs b/src/ImageSharp/PixelFormats/Utils/PixelConverter.cs
index 8142640848..ab9011a5c7 100644
--- a/src/ImageSharp/PixelFormats/Utils/PixelConverter.cs
+++ b/src/ImageSharp/PixelFormats/Utils/PixelConverter.cs
@@ -1,7 +1,7 @@
-// Copyright (c) Six Labors.
+// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.
-using System.Buffers.Binary;
+using System;
using System.Runtime.CompilerServices;
namespace SixLabors.ImageSharp.PixelFormats.Utils
@@ -21,88 +21,64 @@ internal static class PixelConverter
public static class FromRgba32
{
///
- /// Converts a packed to .
+ /// Converts a <see cref="ReadOnlySpan{Byte}"/> representing a collection of
+ /// <see cref="Rgba32"/> pixels to a <see cref="Span{Byte}"/> representing
+ /// a collection of <see cref="Argb32"/> pixels.
///
[MethodImpl(InliningOptions.ShortMethod)]
- public static uint ToArgb32(uint packedRgba)
- {
- // packedRgba = [aa bb gg rr]
- // ROTL(8, packedRgba) = [bb gg rr aa]
- return (packedRgba << 8) | (packedRgba >> 24);
- }
+ public static void ToArgb32(ReadOnlySpan source, Span dest)
+ => SimdUtils.Shuffle4Channel(source, dest, default); // NOTE(review): the shuffle type argument is not visible in this view; the removed scalar code was ROTL(8), which matches WXYZShuffle4 - confirm.
///
- /// Converts a packed to .
+ /// Converts a <see cref="ReadOnlySpan{Byte}"/> representing a collection of
+ /// <see cref="Rgba32"/> pixels to a <see cref="Span{Byte}"/> representing
+ /// a collection of <see cref="Bgra32"/> pixels.
///
[MethodImpl(InliningOptions.ShortMethod)]
- public static uint ToBgra32(uint packedRgba)
- {
- // packedRgba = [aa bb gg rr]
- // tmp1 = [aa 00 gg 00]
- // tmp2 = [00 bb 00 rr]
- // tmp3=ROTL(16, tmp2) = [00 rr 00 bb]
- // tmp1 + tmp3 = [aa rr gg bb]
- uint tmp1 = packedRgba & 0xFF00FF00;
- uint tmp2 = packedRgba & 0x00FF00FF;
- uint tmp3 = (tmp2 << 16) | (tmp2 >> 16);
- return tmp1 + tmp3;
- }
+ public static void ToBgra32(ReadOnlySpan source, Span dest)
+ => SimdUtils.Shuffle4Channel(source, dest, default); // NOTE(review): the shuffle type argument is not visible in this view; the removed scalar code swapped bytes 0 and 2, which matches ZYXWShuffle4 - confirm.
}
public static class FromArgb32
{
/// <summary>
- /// Converts a packed <see cref="Argb32"/> to <see cref="Rgba32"/>.
+ /// Converts a <see cref="ReadOnlySpan{T}"/> representing a collection of
+ /// <see cref="Argb32"/> pixels to a <see cref="Span{T}"/> representing
+ /// a collection of <see cref="Rgba32"/> pixels.
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
- public static uint ToRgba32(uint packedArgb)
- {
- // packedArgb = [bb gg rr aa]
- // ROTR(8, packedArgb) = [aa bb gg rr]
- return (packedArgb >> 8) | (packedArgb << 24);
- }
+ public static void ToRgba32(ReadOnlySpan<byte> source, Span<byte> dest)
+ => SimdUtils.Shuffle4Channel<YZWXShuffle4>(source, dest, default); // bytes [A R G B] -> [R G B A]
/// <summary>
- /// Converts a packed <see cref="Argb32"/> to <see cref="Bgra32"/>.
+ /// Converts a <see cref="ReadOnlySpan{T}"/> representing a collection of
+ /// <see cref="Argb32"/> pixels to a <see cref="Span{T}"/> representing
+ /// a collection of <see cref="Bgra32"/> pixels.
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
- public static uint ToBgra32(uint packedArgb)
- {
- // packedArgb = [bb gg rr aa]
- // REVERSE(packedArgb) = [aa rr gg bb]
- return BinaryPrimitives.ReverseEndianness(packedArgb);
- }
+ public static void ToBgra32(ReadOnlySpan<byte> source, Span<byte> dest)
+ => SimdUtils.Shuffle4Channel<WZYXShuffle4>(source, dest, default); // bytes [A R G B] -> [B G R A]
}
public static class FromBgra32
{
/// <summary>
- /// Converts a packed <see cref="Bgra32"/> to <see cref="Argb32"/>.
+ /// Converts a <see cref="ReadOnlySpan{T}"/> representing a collection of
+ /// <see cref="Bgra32"/> pixels to a <see cref="Span{T}"/> representing
+ /// a collection of <see cref="Argb32"/> pixels.
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
- public static uint ToArgb32(uint packedBgra)
- {
- // packedBgra = [aa rr gg bb]
- // REVERSE(packedBgra) = [bb gg rr aa]
- return BinaryPrimitives.ReverseEndianness(packedBgra);
- }
+ public static void ToArgb32(ReadOnlySpan<byte> source, Span<byte> dest)
+ => SimdUtils.Shuffle4Channel<WZYXShuffle4>(source, dest, default); // bytes [B G R A] -> [A R G B]
/// <summary>
- /// Converts a packed <see cref="Bgra32"/> to <see cref="Rgba32"/>.
+ /// Converts a <see cref="ReadOnlySpan{T}"/> representing a collection of
+ /// <see cref="Bgra32"/> pixels to a <see cref="Span{T}"/> representing
+ /// a collection of <see cref="Rgba32"/> pixels.
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
- public static uint ToRgba32(uint packedBgra)
- {
- // packedRgba = [aa rr gg bb]
- // tmp1 = [aa 00 gg 00]
- // tmp2 = [00 rr 00 bb]
- // tmp3=ROTL(16, tmp2) = [00 bb 00 rr]
- // tmp1 + tmp3 = [aa bb gg rr]
- uint tmp1 = packedBgra & 0xFF00FF00;
- uint tmp2 = packedBgra & 0x00FF00FF;
- uint tmp3 = (tmp2 << 16) | (tmp2 >> 16);
- return tmp1 + tmp3;
- }
+ public static void ToRgba32(ReadOnlySpan<byte> source, Span<byte> dest)
+ => SimdUtils.Shuffle4Channel<ZYXWShuffle4>(source, dest, default); // bytes [B G R A] -> [R G B A]
}
}
-}
\ No newline at end of file
+}
diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleByte4Channel.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleByte4Channel.cs
new file mode 100644
index 0000000000..749859eac9
--- /dev/null
+++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleByte4Channel.cs
@@ -0,0 +1,67 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using BenchmarkDotNet.Attributes;
+
+namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
+{
+ /// <summary>
+ /// Benchmarks the byte overload of <c>SimdUtils.Shuffle4Channel</c> on buffers
+ /// of <see cref="Count"/> bytes, using the jobs defined by
+ /// <c>Config.HwIntrinsics_SSE_AVX</c>.
+ /// </summary>
+ [Config(typeof(Config.HwIntrinsics_SSE_AVX))]
+ public class ShuffleByte4Channel
+ {
+     private byte[] source;
+     private byte[] destination;
+
+     /// <summary>
+     /// Allocates both buffers and fills the source with pseudo-random bytes.
+     /// </summary>
+     [GlobalSetup]
+     public void Setup()
+     {
+         this.source = new byte[this.Count];
+
+         // The element count doubles as the seed, so the input is reproducible per size.
+         new Random(this.Count).NextBytes(this.source);
+         this.destination = new byte[this.Count];
+     }
+
+     /// <summary>
+     /// Gets or sets the number of bytes in the source and destination buffers.
+     /// </summary>
+     [Params(128, 256, 512, 1024, 2048)]
+     public int Count { get; set; }
+
+     /// <summary>
+     /// Shuffles the whole source buffer into the destination buffer.
+     /// </summary>
+     [Benchmark]
+     public void Shuffle4Channel()
+     {
+         // A bare 'default' gives the compiler nothing to infer the shuffle type from,
+         // so it is named explicitly. WXYZShuffle4 is the same shuffle whose control
+         // byte the float benchmark (ShuffleFloat4Channel) uses.
+         SimdUtils.Shuffle4Channel<WXYZShuffle4>(this.source, this.destination, default);
+     }
+ }
+
+ // 2020-10-29
+ // ##########
+ //
+ // BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19041.572 (2004/?/20H1)
+ // Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
+ // .NET Core SDK=3.1.403
+ // [Host] : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
+ // 1. No HwIntrinsics : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
+ // 2. AVX : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
+ // 3. SSE : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
+ //
+ // Runtime=.NET Core 3.1
+ //
+ // | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated |
+ // |---------------- |------------------- |-------------------------------------------------- |------ |----------:|---------:|---------:|------:|--------:|------:|------:|------:|----------:|
+ // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 128 | 17.39 ns | 0.187 ns | 0.175 ns | 1.00 | 0.00 | - | - | - | - |
+ // | Shuffle4Channel | 2. AVX | Empty | 128 | 21.72 ns | 0.299 ns | 0.279 ns | 1.25 | 0.02 | - | - | - | - |
+ // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 128 | 18.10 ns | 0.346 ns | 0.289 ns | 1.04 | 0.02 | - | - | - | - |
+ // | | | | | | | | | | | | | |
+ // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 256 | 35.51 ns | 0.711 ns | 0.790 ns | 1.00 | 0.00 | - | - | - | - |
+ // | Shuffle4Channel | 2. AVX | Empty | 256 | 23.90 ns | 0.508 ns | 0.820 ns | 0.69 | 0.02 | - | - | - | - |
+ // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 256 | 20.40 ns | 0.133 ns | 0.111 ns | 0.57 | 0.01 | - | - | - | - |
+ // | | | | | | | | | | | | | |
+ // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 512 | 73.39 ns | 0.310 ns | 0.259 ns | 1.00 | 0.00 | - | - | - | - |
+ // | Shuffle4Channel | 2. AVX | Empty | 512 | 26.10 ns | 0.418 ns | 0.391 ns | 0.36 | 0.01 | - | - | - | - |
+ // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 512 | 27.59 ns | 0.556 ns | 0.571 ns | 0.38 | 0.01 | - | - | - | - |
+ // | | | | | | | | | | | | | |
+ // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1024 | 150.64 ns | 2.903 ns | 2.716 ns | 1.00 | 0.00 | - | - | - | - |
+ // | Shuffle4Channel | 2. AVX | Empty | 1024 | 38.67 ns | 0.801 ns | 1.889 ns | 0.24 | 0.02 | - | - | - | - |
+ // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 1024 | 47.13 ns | 0.948 ns | 1.054 ns | 0.31 | 0.01 | - | - | - | - |
+ // | | | | | | | | | | | | | |
+ // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 2048 | 315.29 ns | 5.206 ns | 6.583 ns | 1.00 | 0.00 | - | - | - | - |
+ // | Shuffle4Channel | 2. AVX | Empty | 2048 | 57.37 ns | 1.152 ns | 1.078 ns | 0.18 | 0.01 | - | - | - | - |
+ // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 2048 | 65.75 ns | 1.198 ns | 1.600 ns | 0.21 | 0.01 | - | - | - | - |
+}
diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleFloat4Channel.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleFloat4Channel.cs
new file mode 100644
index 0000000000..6f5b5001be
--- /dev/null
+++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleFloat4Channel.cs
@@ -0,0 +1,68 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using BenchmarkDotNet.Attributes;
+using SixLabors.ImageSharp.Tests;
+
+namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
+{
+ /// <summary>
+ /// Benchmarks the float overload of <c>SimdUtils.Shuffle4Channel</c> on buffers
+ /// of <see cref="Count"/> floats, using the jobs defined by
+ /// <c>Config.HwIntrinsics_SSE_AVX</c>.
+ /// </summary>
+ [Config(typeof(Config.HwIntrinsics_SSE_AVX))]
+ public class ShuffleFloat4Channel
+ {
+     // Control byte taken from a default-initialised WXYZShuffle4.
+     private static readonly byte ShuffleControl = default(WXYZShuffle4).Control;
+
+     private float[] source;
+     private float[] destination;
+
+     /// <summary>
+     /// Gets or sets the number of floats in the source and destination buffers.
+     /// </summary>
+     [Params(128, 256, 512, 1024, 2048)]
+     public int Count { get; set; }
+
+     /// <summary>
+     /// Allocates the destination buffer and generates the random source buffer.
+     /// </summary>
+     [GlobalSetup]
+     public void Setup()
+     {
+         int length = this.Count;
+
+         // The element count doubles as the seed, so the input is reproducible per size.
+         var rng = new Random(length);
+
+         this.destination = new float[length];
+         this.source = rng.GenerateRandomFloatArray(length, 0, 256);
+     }
+
+     /// <summary>
+     /// Shuffles the whole source buffer into the destination buffer.
+     /// </summary>
+     [Benchmark]
+     public void Shuffle4Channel()
+         => SimdUtils.Shuffle4Channel(this.source, this.destination, ShuffleControl);
+ }
+
+ // 2020-10-29
+ // ##########
+ //
+ // BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19041.572 (2004/?/20H1)
+ // Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
+ // .NET Core SDK=3.1.403
+ // [Host] : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
+ // 1. No HwIntrinsics : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
+ // 2. AVX : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
+ // 3. SSE : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
+ //
+ // Runtime=.NET Core 3.1
+ //
+ // | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Ratio | Gen 0 | Gen 1 | Gen 2 | Allocated |
+ // |---------------- |------------------- |-------------------------------------------------- |------ |-----------:|----------:|----------:|------:|------:|------:|------:|----------:|
+ // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 128 | 63.647 ns | 0.5475 ns | 0.4853 ns | 1.00 | - | - | - | - |
+ // | Shuffle4Channel | 2. AVX | Empty | 128 | 9.818 ns | 0.1457 ns | 0.1292 ns | 0.15 | - | - | - | - |
+ // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 128 | 15.267 ns | 0.1005 ns | 0.0940 ns | 0.24 | - | - | - | - |
+ // | | | | | | | | | | | | |
+ // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 256 | 125.586 ns | 1.9312 ns | 1.8064 ns | 1.00 | - | - | - | - |
+ // | Shuffle4Channel | 2. AVX | Empty | 256 | 15.878 ns | 0.1983 ns | 0.1758 ns | 0.13 | - | - | - | - |
+ // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 256 | 29.170 ns | 0.2925 ns | 0.2442 ns | 0.23 | - | - | - | - |
+ // | | | | | | | | | | | | |
+ // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 512 | 263.859 ns | 2.6660 ns | 2.3634 ns | 1.00 | - | - | - | - |
+ // | Shuffle4Channel | 2. AVX | Empty | 512 | 29.452 ns | 0.3334 ns | 0.3118 ns | 0.11 | - | - | - | - |
+ // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 512 | 52.912 ns | 0.1932 ns | 0.1713 ns | 0.20 | - | - | - | - |
+ // | | | | | | | | | | | | |
+ // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1024 | 495.717 ns | 1.9850 ns | 1.8567 ns | 1.00 | - | - | - | - |
+ // | Shuffle4Channel | 2. AVX | Empty | 1024 | 53.757 ns | 0.3212 ns | 0.2847 ns | 0.11 | - | - | - | - |
+ // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 1024 | 107.815 ns | 1.6201 ns | 1.3528 ns | 0.22 | - | - | - | - |
+ // | | | | | | | | | | | | |
+ // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 2048 | 980.134 ns | 3.7407 ns | 3.1237 ns | 1.00 | - | - | - | - |
+ // | Shuffle4Channel | 2. AVX | Empty | 2048 | 105.120 ns | 0.6140 ns | 0.5443 ns | 0.11 | - | - | - | - |
+ // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 2048 | 216.473 ns | 2.3268 ns | 2.0627 ns | 0.22 | - | - | - | - |
+}
diff --git a/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs b/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs
index e8a06bf24e..5ceb4c8a00 100644
--- a/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs
+++ b/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs
@@ -58,25 +58,26 @@ public class HwIntrinsics_SSE_AVX : Config
{
public HwIntrinsics_SSE_AVX()
{
+ this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core31)
+ .WithEnvironmentVariables(
+ new EnvironmentVariable(EnableHWIntrinsic, Off),
+ new EnvironmentVariable(FeatureSIMD, Off))
+ .WithId("1. No HwIntrinsics").AsBaseline());
+
#if SUPPORTS_RUNTIME_INTRINSICS
if (Avx.IsSupported)
{
this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core31)
- .WithId("AVX").AsBaseline());
+ .WithId("2. AVX"));
}
if (Sse.IsSupported)
{
this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core31)
.WithEnvironmentVariables(new EnvironmentVariable(EnableAVX, Off))
- .WithId("SSE"));
+ .WithId("3. SSE"));
}
#endif
- this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core31)
- .WithEnvironmentVariables(
- new EnvironmentVariable(EnableHWIntrinsic, Off),
- new EnvironmentVariable(FeatureSIMD, Off))
- .WithId("No HwIntrinsics"));
}
}
}
diff --git a/tests/ImageSharp.Benchmarks/General/PixelConversion/PixelConversion_ConvertFromRgba32.cs b/tests/ImageSharp.Benchmarks/General/PixelConversion/PixelConversion_ConvertFromRgba32.cs
index 7d6c2efedf..a933f890fc 100644
--- a/tests/ImageSharp.Benchmarks/General/PixelConversion/PixelConversion_ConvertFromRgba32.cs
+++ b/tests/ImageSharp.Benchmarks/General/PixelConversion/PixelConversion_ConvertFromRgba32.cs
@@ -168,49 +168,27 @@ public void InlineShuffle()
[Benchmark]
public void PixelConverter_Rgba32_ToArgb32()
{
- ref uint sBase = ref Unsafe.As(ref this.PermutedRunnerRgbaToArgb.Source[0]);
- ref uint dBase = ref Unsafe.As(ref this.PermutedRunnerRgbaToArgb.Dest[0]);
+ Span source = MemoryMarshal.Cast(this.PermutedRunnerRgbaToArgb.Source);
+ Span dest = MemoryMarshal.Cast(this.PermutedRunnerRgbaToArgb.Dest);
- for (int i = 0; i < this.Count; i++)
- {
- uint s = Unsafe.Add(ref sBase, i);
- Unsafe.Add(ref dBase, i) = PixelConverter.FromRgba32.ToArgb32(s);
- }
- }
-
- [Benchmark]
- public void PixelConverter_Rgba32_ToArgb32_CopyThenWorkOnSingleBuffer()
- {
- Span source = MemoryMarshal.Cast(this.PermutedRunnerRgbaToArgb.Source);
- Span dest = MemoryMarshal.Cast(this.PermutedRunnerRgbaToArgb.Dest);
- source.CopyTo(dest);
-
- ref uint dBase = ref MemoryMarshal.GetReference(dest);
-
- for (int i = 0; i < this.Count; i++)
- {
- uint s = Unsafe.Add(ref dBase, i);
- Unsafe.Add(ref dBase, i) = PixelConverter.FromRgba32.ToArgb32(s);
- }
+ PixelConverter.FromRgba32.ToArgb32(source, dest);
}
/*
RESULTS:
- Method | Count | Mean | Error | StdDev | Scaled | ScaledSD |
- ---------------------------------------------------------- |------ |-----------:|-----------:|-----------:|-------:|---------:|
- ByRef | 256 | 328.7 ns | 6.6141 ns | 6.1868 ns | 1.00 | 0.00 |
- ByVal | 256 | 322.0 ns | 4.3541 ns | 4.0728 ns | 0.98 | 0.02 |
- FromBytes | 256 | 321.5 ns | 3.3499 ns | 3.1335 ns | 0.98 | 0.02 |
- InlineShuffle | 256 | 330.7 ns | 4.2525 ns | 3.9778 ns | 1.01 | 0.02 |
- PixelConverter_Rgba32_ToArgb32 | 256 | 167.4 ns | 0.6357 ns | 0.5309 ns | 0.51 | 0.01 |
- PixelConverter_Rgba32_ToArgb32_CopyThenWorkOnSingleBuffer | 256 | 196.6 ns | 0.8929 ns | 0.7915 ns | 0.60 | 0.01 |
- | | | | | | |
- ByRef | 2048 | 2,534.4 ns | 8.2947 ns | 6.9265 ns | 1.00 | 0.00 |
- ByVal | 2048 | 2,638.5 ns | 52.6843 ns | 70.3320 ns | 1.04 | 0.03 |
- FromBytes | 2048 | 2,517.2 ns | 40.8055 ns | 38.1695 ns | 0.99 | 0.01 |
- InlineShuffle | 2048 | 2,546.5 ns | 21.2506 ns | 19.8778 ns | 1.00 | 0.01 |
- PixelConverter_Rgba32_ToArgb32 | 2048 | 1,265.7 ns | 5.1397 ns | 4.5562 ns | 0.50 | 0.00 |
- PixelConverter_Rgba32_ToArgb32_CopyThenWorkOnSingleBuffer | 2048 | 1,410.3 ns | 11.1939 ns | 9.9231 ns | 0.56 | 0.00 |
- */
+ | Method | Count | Mean | Error | StdDev | Median | Ratio | RatioSD |
+ |------------------------------- |------ |------------:|----------:|----------:|------------:|------:|--------:|
+ | ByRef | 256 | 288.84 ns | 19.601 ns | 52.319 ns | 268.10 ns | 1.00 | 0.00 |
+ | ByVal | 256 | 267.97 ns | 1.831 ns | 1.713 ns | 267.85 ns | 0.77 | 0.18 |
+ | FromBytes | 256 | 266.81 ns | 2.427 ns | 2.270 ns | 266.47 ns | 0.76 | 0.18 |
+ | InlineShuffle | 256 | 291.41 ns | 5.820 ns | 5.444 ns | 290.17 ns | 0.83 | 0.19 |
+ | PixelConverter_Rgba32_ToArgb32 | 256 | 38.62 ns | 0.431 ns | 0.403 ns | 38.68 ns | 0.11 | 0.03 |
+ | | | | | | | | |
+ | ByRef | 2048 | 2,197.69 ns | 15.826 ns | 14.804 ns | 2,197.25 ns | 1.00 | 0.00 |
+ | ByVal | 2048 | 2,226.81 ns | 44.266 ns | 62.054 ns | 2,197.17 ns | 1.03 | 0.04 |
+ | FromBytes | 2048 | 2,181.35 ns | 18.033 ns | 16.868 ns | 2,185.97 ns | 0.99 | 0.01 |
+ | InlineShuffle | 2048 | 2,233.10 ns | 27.673 ns | 24.531 ns | 2,229.78 ns | 1.02 | 0.01 |
+ | PixelConverter_Rgba32_ToArgb32 | 2048 | 139.90 ns | 2.152 ns | 3.825 ns | 138.70 ns | 0.06 | 0.00 |
+ */
}
}
diff --git a/tests/ImageSharp.Benchmarks/ImageSharp.Benchmarks.csproj b/tests/ImageSharp.Benchmarks/ImageSharp.Benchmarks.csproj
index eaab162ff2..4784a219b2 100644
--- a/tests/ImageSharp.Benchmarks/ImageSharp.Benchmarks.csproj
+++ b/tests/ImageSharp.Benchmarks/ImageSharp.Benchmarks.csproj
@@ -17,6 +17,7 @@
+
diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs
new file mode 100644
index 0000000000..06f61e617d
--- /dev/null
+++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs
@@ -0,0 +1,161 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using SixLabors.ImageSharp.Tests.TestUtilities;
+using Xunit;
+
+namespace SixLabors.ImageSharp.Tests.Common
+{
+ public partial class SimdUtilsTests
+ {
+ /// <summary>
+ /// Checks the float overload of <c>SimdUtils.Shuffle4Channel</c> against a scalar
+ /// reference shuffle for every size in <c>ArraySizesDivisibleBy4</c>.
+ /// </summary>
+ /// <param name="count">The number of floats to shuffle.</param>
+ [Theory]
+ [MemberData(nameof(ArraySizesDivisibleBy4))]
+ public void BulkShuffleFloat4Channel(int count)
+ {
+     // Static so that nothing is captured: the size arrives as a serialized string.
+     static void RunTest(string serialized)
+     {
+         // No need to test multiple shuffle controls as the
+         // pipeline is always the same.
+         int size = FeatureTestRunner.Deserialize<int>(serialized);
+         byte control = default(WZYXShuffle4).Control;
+
+         TestShuffleFloat4Channel(
+             size,
+             (s, d) => SimdUtils.Shuffle4Channel(s.Span, d.Span, control),
+             control);
+     }
+
+     // NOTE(review): presumably RunTest is executed once per listed intrinsics
+     // setting - confirm against FeatureTestRunner.
+     FeatureTestRunner.RunWithHwIntrinsicsFeature(
+         RunTest,
+         count,
+         HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX | HwIntrinsics.DisableSSE);
+ }
+
+ /// <summary>
+ /// Checks the byte overload of <c>SimdUtils.Shuffle4Channel</c> with several
+ /// shuffle structs for every size in <c>ArraySizesDivisibleBy4</c>.
+ /// </summary>
+ /// <param name="count">The number of bytes to shuffle.</param>
+ [Theory]
+ [MemberData(nameof(ArraySizesDivisibleBy4))]
+ public void BulkShuffleByte4Channel(int count)
+ {
+     // Static so that nothing is captured: the size arrives as a serialized string.
+     static void RunTest(string serialized)
+     {
+         int size = FeatureTestRunner.Deserialize<int>(serialized);
+
+         // These cannot be expressed as a theory as you cannot
+         // use RemoteExecutor within generic methods nor pass
+         // IComponentShuffle to the generic utils method.
+         //
+         // The theory already supplies every size, so each shuffle is checked
+         // exactly once per invocation; looping over ArraySizesDivisibleBy4 again
+         // here would only repeat identical checks against the same 'size'.
+         WXYZShuffle4 wxyz = default;
+         TestShuffleByte4Channel(
+             size,
+             (s, d) => SimdUtils.Shuffle4Channel(s.Span, d.Span, wxyz),
+             wxyz.Control);
+
+         WZYXShuffle4 wzyx = default;
+         TestShuffleByte4Channel(
+             size,
+             (s, d) => SimdUtils.Shuffle4Channel(s.Span, d.Span, wzyx),
+             wzyx.Control);
+
+         YZWXShuffle4 yzwx = default;
+         TestShuffleByte4Channel(
+             size,
+             (s, d) => SimdUtils.Shuffle4Channel(s.Span, d.Span, yzwx),
+             yzwx.Control);
+
+         ZYXWShuffle4 zyxw = default;
+         TestShuffleByte4Channel(
+             size,
+             (s, d) => SimdUtils.Shuffle4Channel(s.Span, d.Span, zyxw),
+             zyxw.Control);
+
+         // Arbitrary controls go through DefaultShuffle4, including ones that
+         // broadcast a single component to all four channels.
+         var xwyz = new DefaultShuffle4(2, 1, 3, 0);
+         TestShuffleByte4Channel(
+             size,
+             (s, d) => SimdUtils.Shuffle4Channel(s.Span, d.Span, xwyz),
+             xwyz.Control);
+
+         var yyyy = new DefaultShuffle4(1, 1, 1, 1);
+         TestShuffleByte4Channel(
+             size,
+             (s, d) => SimdUtils.Shuffle4Channel(s.Span, d.Span, yyyy),
+             yyyy.Control);
+
+         var wwww = new DefaultShuffle4(3, 3, 3, 3);
+         TestShuffleByte4Channel(
+             size,
+             (s, d) => SimdUtils.Shuffle4Channel(s.Span, d.Span, wwww),
+             wwww.Control);
+     }
+
+     // NOTE(review): presumably RunTest is executed once per listed intrinsics
+     // setting - confirm against FeatureTestRunner.
+     FeatureTestRunner.RunWithHwIntrinsicsFeature(
+         RunTest,
+         count,
+         HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE);
+ }
+
+ /// <summary>
+ /// Runs <paramref name="convert"/> over a seeded random float buffer and compares
+ /// the output with a scalar shuffle computed from <paramref name="control"/>.
+ /// </summary>
+ /// <param name="count">The number of floats to shuffle; the reference loop reads four at a time.</param>
+ /// <param name="convert">The shuffle under test, given the source and destination buffers.</param>
+ /// <param name="control">The shuffle control byte used to build the expected output.</param>
+ private static void TestShuffleFloat4Channel(
+     int count,
+     Action<Memory<float>, Memory<float>> convert,
+     byte control)
+ {
+     // The element count doubles as the seed, so the input is reproducible per size.
+     float[] source = new Random(count).GenerateRandomFloatArray(count, 0, 256);
+     var result = new float[count];
+
+     float[] expected = new float[count];
+
+     // Unpack the control byte into the four source offsets (p0 feeds element 0, and so on).
+     SimdUtils.Shuffle.InverseMmShuffle(
+         control,
+         out int p3,
+         out int p2,
+         out int p1,
+         out int p0);
+
+     // Scalar reference: shuffle each group of four independently.
+     for (int i = 0; i < expected.Length; i += 4)
+     {
+         expected[i] = source[p0 + i];
+         expected[i + 1] = source[p1 + i];
+         expected[i + 2] = source[p2 + i];
+         expected[i + 3] = source[p3 + i];
+     }
+
+     convert(source, result);
+
+     Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5F));
+ }
+
+ private static void TestShuffleByte4Channel(
+ int count,
+ Action, Memory