From 4fb1859dddde1d38cacc89fa597c9bb6cb35af05 Mon Sep 17 00:00:00 2001
From: Nicolas Portmann
Date: Thu, 21 Jan 2021 16:47:05 +0100
Subject: [PATCH] Vectorize Scale16X16To8X8

---
Review notes (everything between the "---" line and the first "diff --git" is ignored by git am):
- Scale16X16To8X8Vectorized walks the source as two block pairs (0/1, then 2/3).
  A single linear walk over the rows reads blocks 1/2 for destination rows 4-7,
  which disagrees with the dstOff layout of the scalar path.
- The benchmark seeds its Random and names its fields after their roles.
- Line breaks, indentation and generic type arguments were reconstructed from a
  flattened copy; if context lines do not match, apply with --ignore-whitespace.
- The blob ids on the "index" lines were not recomputed for the edited content.

 .../Common/Helpers/SimdUtils.HwIntrinsics.cs  |  2 +
 .../Formats/Jpeg/Components/Block8x8F.cs      | 67 +++++++++++++++++++
 .../Block8x8F_Scale16X16To8X8.cs              | 42 ++++++++++++
 3 files changed, 111 insertions(+)
 create mode 100644 tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Scale16X16To8X8.cs

diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
index 475d64bc4f..4faf577fd9 100644
--- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
@@ -19,6 +19,8 @@ public static class HwIntrinsics
 
             public static ReadOnlySpan<byte> PermuteMaskEvenOdd8x32 => new byte[] { 0, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0 };
 
+            public static ReadOnlySpan<byte> PermuteMaskSwitchInnerDWords8x32 => new byte[] { 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0 };
+
             private static ReadOnlySpan<byte> ShuffleMaskPad4Nx16 => new byte[] { 0, 1, 2, 0x80, 3, 4, 5, 0x80, 6, 7, 8, 0x80, 9, 10, 11, 0x80 };
 
             private static ReadOnlySpan<byte> ShuffleMaskSlice4Nx16 => new byte[] { 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 0x80, 0x80, 0x80, 0x80 };
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
index fd4748fa9d..3a29e21d9c 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
@@ -485,6 +485,73 @@ public static unsafe void Quantize(
         /// <param name="source">The source block.</param>
         public static unsafe void Scale16X16To8X8(ref Block8x8F destination, ReadOnlySpan<Block8x8F> source)
         {
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Avx2.IsSupported)
+            {
+                Scale16X16To8X8Vectorized(ref destination, source);
+                return;
+            }
+#endif
+
+            Scale16X16To8X8Scalar(ref destination, source);
+        }
+
+        /// <summary>
+        /// AVX2 code path of Scale16X16To8X8: every destination sample is computed from a
+        /// 2x2 group of samples of one of the four source blocks as (sum + 2) * 0.25.
+        /// </summary>
+        /// <param name="destination">The destination block.</param>
+        /// <param name="source">The four source blocks; [0] and [1] feed destination rows 0-3, [2] and [3] rows 4-7.</param>
+        private static void Scale16X16To8X8Vectorized(ref Block8x8F destination, ReadOnlySpan<Block8x8F> source)
+        {
+#if SUPPORTS_RUNTIME_INTRINSICS
+            Debug.Assert(Avx2.IsSupported, "AVX2 is required to execute this method");
+            Debug.Assert(source.Length >= 4, "Four source blocks are required");
+
+            var f2 = Vector256.Create(2f);
+            var f025 = Vector256.Create(0.25f);
+            Vector256<int> switchInnerDoubleWords = Unsafe.As<byte, Vector256<int>>(ref MemoryMarshal.GetReference(SimdUtils.HwIntrinsics.PermuteMaskSwitchInnerDWords8x32));
+            ref Block8x8F sourceBase = ref MemoryMarshal.GetReference(source);
+            ref Vector256<float> destRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref destination);
+
+            // Each pass consumes one horizontal pair of source blocks (left, right)
+            // and produces four destination rows: blocks 0/1 first, then blocks 2/3.
+            for (int i = 0; i < 2; i++)
+            {
+                ref Vector256<float> in1 = ref Unsafe.As<Block8x8F, Vector256<float>>(ref Unsafe.Add(ref sourceBase, 2 * i));
+                ref Vector256<float> in2 = ref Unsafe.As<Block8x8F, Vector256<float>>(ref Unsafe.Add(ref sourceBase, (2 * i) + 1));
+
+                for (int j = 0; j < 8; j += 2)
+                {
+                    // Two adjacent rows of the left block (a, b) and of the right block (c, d).
+                    Vector256<float> a = Unsafe.Add(ref in1, j);
+                    Vector256<float> b = Unsafe.Add(ref in1, j + 1);
+                    Vector256<float> c = Unsafe.Add(ref in2, j);
+                    Vector256<float> d = Unsafe.Add(ref in2, j + 1);
+
+                    // calc1/calc3 pick the even columns, calc2/calc4 the odd columns,
+                    // so adding all four puts one complete 2x2 group into each element.
+                    Vector256<float> calc1 = Avx.Shuffle(a, c, 0b10_00_10_00);
+                    Vector256<float> calc2 = Avx.Shuffle(a, c, 0b11_01_11_01);
+                    Vector256<float> calc3 = Avx.Shuffle(b, d, 0b10_00_10_00);
+                    Vector256<float> calc4 = Avx.Shuffle(b, d, 0b11_01_11_01);
+
+                    // (sum + 2) * 0.25
+                    Vector256<float> sum = Avx.Add(Avx.Add(calc1, calc2), Avx.Add(calc3, calc4));
+                    Vector256<float> add = Avx.Add(sum, f2);
+                    Vector256<float> res = Avx.Multiply(add, f025);
+
+                    // The shuffles work per 128-bit lane, leaving [L0 L1 R0 R1 | L2 L3 R2 R3];
+                    // swapping the two inner element pairs yields [L0 L1 L2 L3 | R0 R1 R2 R3].
+                    destRef = Avx2.PermuteVar8x32(res, switchInnerDoubleWords);
+                    destRef = ref Unsafe.Add(ref destRef, 1);
+                }
+            }
+#endif
+        }
+
+        private static unsafe void Scale16X16To8X8Scalar(ref Block8x8F destination, ReadOnlySpan<Block8x8F> source)
+        {
             for (int i = 0; i < 4; i++)
             {
                 int dstOff = ((i & 2) << 4) | ((i & 1) << 2);
diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Scale16X16To8X8.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Scale16X16To8X8.cs
new file mode 100644
index 0000000000..8188297608
--- /dev/null
+++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Scale16X16To8X8.cs
@@ -0,0 +1,42 @@
+using System;
+using BenchmarkDotNet.Attributes;
+using SixLabors.ImageSharp.Formats.Jpeg.Components;
+
+namespace SixLabors.ImageSharp.Benchmarks.Format.Jpeg.Components
+{
+    /// <summary>
+    /// Benchmarks Block8x8F.Scale16X16To8X8, which scales four source blocks into one destination block.
+    /// </summary>
+    [Config(typeof(Config.HwIntrinsics_SSE_AVX))]
+    public class Block8x8F_Scale16X16To8X8
+    {
+        // Fixed seed so that every run and every job measures the same input data.
+        private const int Seed = 42;
+
+        private readonly Block8x8F[] sources = new Block8x8F[4];
+        private Block8x8F destination;
+
+        [GlobalSetup]
+        public void Setup()
+        {
+            var random = new Random(Seed);
+
+            float[] f = new float[8 * 8];
+            for (int i = 0; i < f.Length; i++)
+            {
+                f[i] = (float)random.NextDouble();
+            }
+
+            for (int i = 0; i < this.sources.Length; i++)
+            {
+                this.sources[i] = Block8x8F.Load(f);
+            }
+
+            // Only gives the destination defined contents before the first invocation.
+            this.destination = Block8x8F.Load(f);
+        }
+
+        [Benchmark]
+        public void Scale16X16To8X8() => Block8x8F.Scale16X16To8X8(ref this.destination, this.sources);
+    }
+}