diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs index bc6036903b..9d49b8c45f 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs @@ -2,17 +2,22 @@ // Licensed under the Apache License, Version 2.0. using System; -using System.Diagnostics; +using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif using System.Text; namespace SixLabors.ImageSharp.Formats.Jpeg.Components { /// - /// Represents a Jpeg block with coefficients. + /// 8x8 matrix of coefficients. /// // ReSharper disable once InconsistentNaming + [StructLayout(LayoutKind.Explicit)] internal unsafe struct Block8x8 : IEquatable { /// @@ -20,24 +25,44 @@ internal unsafe struct Block8x8 : IEquatable /// public const int Size = 64; +#pragma warning disable IDE0051 // Remove unused private member /// - /// A fixed size buffer holding the values. - /// See: - /// https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/unsafe-code-pointers/fixed-size-buffers - /// + /// A placeholder buffer so the actual struct occupies exactly 64 * 2 bytes. /// + /// + /// This is not used directly in the code. + /// + [FieldOffset(0)] private fixed short data[Size]; - - /// - /// Initializes a new instance of the struct. 
- /// - /// A of coefficients - public Block8x8(Span coefficients) - { - ref byte selfRef = ref Unsafe.As(ref this); - ref byte sourceRef = ref Unsafe.As(ref MemoryMarshal.GetReference(coefficients)); - Unsafe.CopyBlock(ref selfRef, ref sourceRef, Size * sizeof(short)); - } +#pragma warning restore IDE0051 + +#if SUPPORTS_RUNTIME_INTRINSICS + [FieldOffset(0)] + public Vector128 V0; + [FieldOffset(16)] + public Vector128 V1; + [FieldOffset(32)] + public Vector128 V2; + [FieldOffset(48)] + public Vector128 V3; + [FieldOffset(64)] + public Vector128 V4; + [FieldOffset(80)] + public Vector128 V5; + [FieldOffset(96)] + public Vector128 V6; + [FieldOffset(112)] + public Vector128 V7; + + [FieldOffset(0)] + public Vector256 V01; + [FieldOffset(32)] + public Vector256 V23; + [FieldOffset(64)] + public Vector256 V45; + [FieldOffset(96)] + public Vector256 V67; +#endif /// /// Gets or sets a value at the given index @@ -49,7 +74,8 @@ public short this[int idx] [MethodImpl(MethodImplOptions.AggressiveInlining)] get { - GuardBlockIndex(idx); + DebugGuard.MustBeBetweenOrEqualTo(idx, 0, Size - 1, nameof(idx)); + ref short selfRef = ref Unsafe.As(ref this); return Unsafe.Add(ref selfRef, idx); } @@ -57,7 +83,8 @@ public short this[int idx] [MethodImpl(MethodImplOptions.AggressiveInlining)] set { - GuardBlockIndex(idx); + DebugGuard.MustBeBetweenOrEqualTo(idx, 0, Size - 1, nameof(idx)); + ref short selfRef = ref Unsafe.As(ref this); Unsafe.Add(ref selfRef, idx) = value; } @@ -75,15 +102,9 @@ public short this[int idx] set => this[(y * 8) + x] = value; } - public static bool operator ==(Block8x8 left, Block8x8 right) - { - return left.Equals(right); - } + public static bool operator ==(Block8x8 left, Block8x8 right) => left.Equals(right); - public static bool operator !=(Block8x8 left, Block8x8 right) - { - return !left.Equals(right); - } + public static bool operator !=(Block8x8 left, Block8x8 right) => !left.Equals(right); /// /// Multiply all elements by a given @@ -149,34 
+170,11 @@ public short this[int idx] return result; } - /// - /// Pointer-based "Indexer" (getter part) - /// - /// Block pointer - /// Index - /// The scaleVec value at the specified index - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static short GetScalarAt(Block8x8* blockPtr, int idx) - { - GuardBlockIndex(idx); - - short* fp = blockPtr->data; - return fp[idx]; - } - - /// - /// Pointer-based "Indexer" (setter part) - /// - /// Block pointer - /// Index - /// Value - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void SetScalarAt(Block8x8* blockPtr, int idx, short value) + public static Block8x8 Load(Span data) { - GuardBlockIndex(idx); - - short* fp = blockPtr->data; - fp[idx] = value; + Unsafe.SkipInit(out Block8x8 result); + result.LoadFrom(data); + return result; } /// @@ -194,7 +192,7 @@ public Block8x8F AsFloatBlock() /// public short[] ToArray() { - var result = new short[Size]; + short[] result = new short[Size]; this.CopyTo(result); return result; } @@ -206,7 +204,7 @@ public void CopyTo(Span destination) { ref byte selfRef = ref Unsafe.As(ref this); ref byte destRef = ref MemoryMarshal.GetReference(MemoryMarshal.Cast(destination)); - Unsafe.CopyBlock(ref destRef, ref selfRef, Size * sizeof(short)); + Unsafe.CopyBlockUnaligned(ref destRef, ref selfRef, Size * sizeof(short)); } /// @@ -220,6 +218,19 @@ public void CopyTo(Span destination) } } + /// + /// Load raw 16bit integers from source. + /// + /// Source + [MethodImpl(InliningOptions.ShortMethod)] + public void LoadFrom(Span source) + { + ref byte sourceRef = ref Unsafe.As(ref MemoryMarshal.GetReference(source)); + ref byte destRef = ref Unsafe.As(ref this); + + Unsafe.CopyBlockUnaligned(ref destRef, ref sourceRef, Size * sizeof(short)); + } + /// /// Cast and copy -s from the beginning of 'source' span. 
/// @@ -231,13 +242,6 @@ public void LoadFrom(Span source) } } - [Conditional("DEBUG")] - private static void GuardBlockIndex(int idx) - { - DebugGuard.MustBeLessThan(idx, Size, nameof(idx)); - DebugGuard.MustBeGreaterThanOrEqualTo(idx, 0, nameof(idx)); - } - /// public override string ToString() { @@ -271,15 +275,66 @@ public bool Equals(Block8x8 other) } /// - public override bool Equals(object obj) - { - return obj is Block8x8 other && this.Equals(other); - } + public override bool Equals(object obj) => obj is Block8x8 other && this.Equals(other); /// - public override int GetHashCode() + public override int GetHashCode() => (this[0] * 31) + this[1]; + + /// + /// Returns index of the last non-zero element in given matrix. + /// + /// + /// Index of the last non-zero element. Returns -1 if all elements are equal to zero. + /// + [MethodImpl(InliningOptions.ShortMethod)] + public nint GetLastNonZeroIndex() { - return (this[0] * 31) + this[1]; +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx2.IsSupported) + { + const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111); + + Vector256 zero16 = Vector256.Zero; + + ref Vector256 mcuStride = ref Unsafe.As>(ref this); + + for (nint i = 3; i >= 0; i--) + { + int areEqual = Avx2.MoveMask(Avx2.CompareEqual(Unsafe.Add(ref mcuStride, i), zero16).AsByte()); + + if (areEqual != equalityMask) + { + // Each 2 bits represents comparison operation for each 2-byte element in input vectors + // LSB represents first element in the stride + // MSB represents last element in the stride + // lzcnt operation would calculate number of zero numbers at the end + + // Given mask is not actually suitable for lzcnt as 1's represent zero elements and 0's represent non-zero elements + // So we need to invert it + int lzcnt = BitOperations.LeadingZeroCount(~(uint)areEqual); + + // As input number is represented by 2 bits in the mask, we need to divide lzcnt result by 2 + // to get the exact number of zero elements in the stride 
+ int strideRelativeIndex = 15 - (lzcnt / 2); + return (i * 16) + strideRelativeIndex; + } + } + + return -1; + } + else +#endif + { + nint index = Size - 1; + ref short elemRef = ref Unsafe.As(ref this); + + while (index >= 0 && Unsafe.Add(ref elemRef, index) == 0) + { + index--; + } + + return index; + } } /// diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs new file mode 100644 index 0000000000..0971ccdca0 --- /dev/null +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs @@ -0,0 +1,149 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +#if SUPPORTS_RUNTIME_INTRINSICS +using System; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; + +namespace SixLabors.ImageSharp.Formats.Jpeg.Components +{ + internal partial struct Block8x8F + { + /// + /// A number of rows of 8 scalar coefficients each in + /// + public const int RowCount = 8; + + [FieldOffset(0)] + public Vector256 V0; + [FieldOffset(32)] + public Vector256 V1; + [FieldOffset(64)] + public Vector256 V2; + [FieldOffset(96)] + public Vector256 V3; + [FieldOffset(128)] + public Vector256 V4; + [FieldOffset(160)] + public Vector256 V5; + [FieldOffset(192)] + public Vector256 V6; + [FieldOffset(224)] + public Vector256 V7; + + private static readonly Vector256 MultiplyIntoInt16ShuffleMask = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7); + + private static unsafe void MultiplyIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest) + { + DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!"); + + ref Vector256 aBase = ref a.V0; + ref Vector256 bBase = ref b.V0; + + ref Vector256 destRef = ref dest.V01; + + for (nint i = 0; i < 8; i += 2) + { + Vector256 row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref 
aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); + Vector256 row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1))); + + Vector256 row = Avx2.PackSignedSaturate(row0, row1); + row = Avx2.PermuteVar8x32(row.AsInt32(), MultiplyIntoInt16ShuffleMask).AsInt16(); + + Unsafe.Add(ref destRef, (IntPtr)((uint)i / 2)) = row; + } + } + + private static void MultiplyIntoInt16_Sse2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest) + { + DebugGuard.IsTrue(Sse2.IsSupported, "Sse2 support is required to run this operation!"); + + ref Vector128 aBase = ref Unsafe.As>(ref a); + ref Vector128 bBase = ref Unsafe.As>(ref b); + + ref Vector128 destBase = ref Unsafe.As>(ref dest); + + for (int i = 0; i < 16; i += 2) + { + Vector128 left = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); + Vector128 right = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1))); + + Vector128 row = Sse2.PackSignedSaturate(left, right); + Unsafe.Add(ref destBase, (IntPtr)((uint)i / 2)) = row; + } + } + + private void TransposeInplace_Avx() + { + // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536 + Vector256 r0 = Avx.InsertVector128( + this.V0, + Unsafe.As>(ref this.V4L), + 1); + + Vector256 r1 = Avx.InsertVector128( + this.V1, + Unsafe.As>(ref this.V5L), + 1); + + Vector256 r2 = Avx.InsertVector128( + this.V2, + Unsafe.As>(ref this.V6L), + 1); + + Vector256 r3 = Avx.InsertVector128( + this.V3, + Unsafe.As>(ref this.V7L), + 1); + + Vector256 r4 = Avx.InsertVector128( + Unsafe.As>(ref this.V0R).ToVector256(), + Unsafe.As>(ref this.V4R), + 1); + + Vector256 r5 = Avx.InsertVector128( + Unsafe.As>(ref this.V1R).ToVector256(), + Unsafe.As>(ref this.V5R), + 1); + + Vector256 r6 = Avx.InsertVector128( + Unsafe.As>(ref this.V2R).ToVector256(), + Unsafe.As>(ref this.V6R), + 1); + + Vector256 r7 = 
Avx.InsertVector128( + Unsafe.As>(ref this.V3R).ToVector256(), + Unsafe.As>(ref this.V7R), + 1); + + Vector256 t0 = Avx.UnpackLow(r0, r1); + Vector256 t2 = Avx.UnpackLow(r2, r3); + Vector256 v = Avx.Shuffle(t0, t2, 0x4E); + this.V0 = Avx.Blend(t0, v, 0xCC); + this.V1 = Avx.Blend(t2, v, 0x33); + + Vector256 t4 = Avx.UnpackLow(r4, r5); + Vector256 t6 = Avx.UnpackLow(r6, r7); + v = Avx.Shuffle(t4, t6, 0x4E); + this.V4 = Avx.Blend(t4, v, 0xCC); + this.V5 = Avx.Blend(t6, v, 0x33); + + Vector256 t1 = Avx.UnpackHigh(r0, r1); + Vector256 t3 = Avx.UnpackHigh(r2, r3); + v = Avx.Shuffle(t1, t3, 0x4E); + this.V2 = Avx.Blend(t1, v, 0xCC); + this.V3 = Avx.Blend(t3, v, 0x33); + + Vector256 t5 = Avx.UnpackHigh(r4, r5); + Vector256 t7 = Avx.UnpackHigh(r6, r7); + v = Avx.Shuffle(t5, t7, 0x4E); + this.V6 = Avx.Blend(t5, v, 0xCC); + this.V7 = Avx.Blend(t7, v, 0x33); + } + } +} +#endif diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.ScaledCopyTo.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.ScaledCopyTo.cs index 23cf4ce4a9..498fe4d03b 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.ScaledCopyTo.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.ScaledCopyTo.cs @@ -1,4 +1,4 @@ -// Copyright (c) Six Labors. +// Copyright (c) Six Labors. // Licensed under the Apache License, Version 2.0. using System.Numerics; diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index f669a7ad9a..02f5a13244 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -16,7 +16,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components { /// - /// Represents a Jpeg block with coefficients. + /// 8x8 matrix of coefficients. 
/// [StructLayout(LayoutKind.Explicit)] internal partial struct Block8x8F : IEquatable @@ -66,30 +66,6 @@ internal partial struct Block8x8F : IEquatable public Vector4 V7L; [FieldOffset(240)] public Vector4 V7R; - -#if SUPPORTS_RUNTIME_INTRINSICS - /// - /// A number of rows of 8 scalar coefficients each in - /// - public const int RowCount = 8; - - [FieldOffset(0)] - public Vector256 V0; - [FieldOffset(32)] - public Vector256 V1; - [FieldOffset(64)] - public Vector256 V2; - [FieldOffset(96)] - public Vector256 V3; - [FieldOffset(128)] - public Vector256 V4; - [FieldOffset(160)] - public Vector256 V5; - [FieldOffset(192)] - public Vector256 V6; - [FieldOffset(224)] - public Vector256 V7; -#endif #pragma warning restore SA1600 // ElementsMustBeDocumented /// @@ -102,7 +78,7 @@ public float this[int idx] [MethodImpl(MethodImplOptions.AggressiveInlining)] get { - GuardBlockIndex(idx); + DebugGuard.MustBeBetweenOrEqualTo(idx, 0, Size - 1, nameof(idx)); ref float selfRef = ref Unsafe.As(ref this); return Unsafe.Add(ref selfRef, (nint)(uint)idx); } @@ -110,7 +86,7 @@ public float this[int idx] [MethodImpl(MethodImplOptions.AggressiveInlining)] set { - GuardBlockIndex(idx); + DebugGuard.MustBeBetweenOrEqualTo(idx, 0, Size - 1, nameof(idx)); ref float selfRef = ref Unsafe.As(ref this); Unsafe.Add(ref selfRef, (nint)(uint)idx) = value; } @@ -188,13 +164,6 @@ public static Block8x8F Load(Span data) return result; } - /// - /// Fill the block with defaults (zeroes). - /// - [MethodImpl(InliningOptions.ShortMethod)] - public void Clear() - => this = default; // The cheapest way to do this in C#: - /// /// Load raw 32bit floating point data from source. /// @@ -302,7 +271,7 @@ public unsafe void ScaledCopyTo(Span dest) public float[] ToArray() { - var result = new float[Size]; + float[] result = new float[Size]; this.ScaledCopyTo(result); return result; } @@ -434,102 +403,37 @@ public void AddInPlace(float value) } /// - /// Quantize the block. 
+ /// Quantize input block, apply zig-zag ordering and store result as 16bit integers. /// - /// The block pointer. - /// The qt pointer. - /// Unzig pointer - public static unsafe void DequantizeBlock(Block8x8F* blockPtr, Block8x8F* qtPtr, byte* unzigPtr) - { - float* b = (float*)blockPtr; - float* qtp = (float*)qtPtr; - for (int qtIndex = 0; qtIndex < Size; qtIndex++) - { - byte blockIndex = unzigPtr[qtIndex]; - float* unzigPos = b + blockIndex; - - float val = *unzigPos; - val *= qtp[qtIndex]; - *unzigPos = val; - } - } - - /// - /// Quantize 'block' into 'dest' using the 'qt' quantization table: - /// Unzig the elements of block into dest, while dividing them by elements of qt and "pre-rounding" the values. - /// To finish the rounding it's enough to (int)-cast these values. - /// - /// Source block - /// Destination block - /// The quantization table - /// The 8x8 Unzig block. - public static unsafe void Quantize( - ref Block8x8F block, - ref Block8x8F dest, - ref Block8x8F qt, - ref ZigZag unZig) + /// Source block. + /// Destination block. + /// The quantization table. 
+ public static void Quantize(ref Block8x8F block, ref Block8x8 dest, ref Block8x8F qt) { - for (int zig = 0; zig < Size; zig++) +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx2.IsSupported) { - dest[zig] = block[unZig[zig]]; + MultiplyIntoInt16_Avx2(ref block, ref qt, ref dest); + ZigZag.ApplyZigZagOrderingAvx2(ref dest); } - - DivideRoundAll(ref dest, ref qt); - } - - [MethodImpl(InliningOptions.ShortMethod)] - private static void DivideRoundAll(ref Block8x8F a, ref Block8x8F b) - { -#if SUPPORTS_RUNTIME_INTRINSICS - if (Avx.IsSupported) + else if (Ssse3.IsSupported) { - var vnegOne = Vector256.Create(-1f); - var vadd = Vector256.Create(.5F); - var vone = Vector256.Create(1f); - - for (int i = 0; i < RowCount; i++) - { - ref Vector256 aRow = ref Unsafe.Add(ref a.V0, i); - ref Vector256 bRow = ref Unsafe.Add(ref b.V0, i); - Vector256 voff = Avx.Multiply(Avx.Min(Avx.Max(vnegOne, aRow), vone), vadd); - aRow = Avx.Add(Avx.Divide(aRow, bRow), voff); - } + MultiplyIntoInt16_Sse2(ref block, ref qt, ref dest); + ZigZag.ApplyZigZagOrderingSsse3(ref dest); } else #endif { - a.V0L = DivideRound(a.V0L, b.V0L); - a.V0R = DivideRound(a.V0R, b.V0R); - a.V1L = DivideRound(a.V1L, b.V1L); - a.V1R = DivideRound(a.V1R, b.V1R); - a.V2L = DivideRound(a.V2L, b.V2L); - a.V2R = DivideRound(a.V2R, b.V2R); - a.V3L = DivideRound(a.V3L, b.V3L); - a.V3R = DivideRound(a.V3R, b.V3R); - a.V4L = DivideRound(a.V4L, b.V4L); - a.V4R = DivideRound(a.V4R, b.V4R); - a.V5L = DivideRound(a.V5L, b.V5L); - a.V5R = DivideRound(a.V5R, b.V5R); - a.V6L = DivideRound(a.V6L, b.V6L); - a.V6R = DivideRound(a.V6R, b.V6R); - a.V7L = DivideRound(a.V7L, b.V7L); - a.V7R = DivideRound(a.V7R, b.V7R); + for (int i = 0; i < Size; i++) + { + int idx = ZigZag.ZigZagOrder[i]; + float quantizedVal = block[idx] * qt[idx]; + quantizedVal += quantizedVal < 0 ? 
-0.5f : 0.5f; + dest[i] = (short)quantizedVal; + } } } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static Vector4 DivideRound(Vector4 dividend, Vector4 divisor) - { - var neg = new Vector4(-1); - var add = new Vector4(.5F); - - // sign(dividend) = max(min(dividend, 1), -1) - Vector4 sign = Numerics.Clamp(dividend, neg, Vector4.One); - - // AlmostRound(dividend/divisor) = dividend/divisor + 0.5*sign(dividend) - return (dividend / divisor) + (sign * add); - } - public void RoundInto(ref Block8x8 dest) { for (int i = 0; i < Size; i++) @@ -627,6 +531,47 @@ public void LoadFromInt16ExtendedAvx2(ref Block8x8 source) Unsafe.Add(ref dRef, 7) = bottom; } + /// + /// Compares entire 8x8 block to a single scalar value. + /// + /// Value to compare to. + public bool EqualsToScalar(int value) + { +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx2.IsSupported) + { + const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111); + + var targetVector = Vector256.Create(value); + ref Vector256 blockStride = ref this.V0; + + for (int i = 0; i < RowCount; i++) + { + Vector256 areEqual = Avx2.CompareEqual(Avx.ConvertToVector256Int32WithTruncation(Unsafe.Add(ref this.V0, i)), targetVector); + if (Avx2.MoveMask(areEqual.AsByte()) != equalityMask) + { + return false; + } + } + + return true; + } +#endif + { + ref float scalars = ref Unsafe.As(ref this); + + for (int i = 0; i < Size; i++) + { + if ((int)Unsafe.Add(ref scalars, i) != value) + { + return false; + } + } + + return true; + } + } + /// public bool Equals(Block8x8F other) => this.V0L == other.V0L @@ -663,213 +608,89 @@ public override string ToString() return sb.ToString(); } - [MethodImpl(InliningOptions.ShortMethod)] - private static Vector NormalizeAndRound(Vector row, Vector off, Vector max) - { - row += off; - row = Vector.Max(row, Vector.Zero); - row = Vector.Min(row, max); - return row.FastRound(); - } - - [Conditional("DEBUG")] - private static void GuardBlockIndex(int idx) - { - 
DebugGuard.MustBeLessThan(idx, Size, nameof(idx)); - DebugGuard.MustBeGreaterThanOrEqualTo(idx, 0, nameof(idx)); - } - /// - /// Transpose the block into the destination block. + /// Transpose the block inplace. /// - /// The destination block [MethodImpl(InliningOptions.ShortMethod)] - public void TransposeInto(ref Block8x8F d) + public void TransposeInplace() { #if SUPPORTS_RUNTIME_INTRINSICS if (Avx.IsSupported) { - // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536 - Vector256 r0 = Avx.InsertVector128( - Unsafe.As>(ref this.V0L).ToVector256(), - Unsafe.As>(ref this.V4L), - 1); - - Vector256 r1 = Avx.InsertVector128( - Unsafe.As>(ref this.V1L).ToVector256(), - Unsafe.As>(ref this.V5L), - 1); - - Vector256 r2 = Avx.InsertVector128( - Unsafe.As>(ref this.V2L).ToVector256(), - Unsafe.As>(ref this.V6L), - 1); - - Vector256 r3 = Avx.InsertVector128( - Unsafe.As>(ref this.V3L).ToVector256(), - Unsafe.As>(ref this.V7L), - 1); - - Vector256 r4 = Avx.InsertVector128( - Unsafe.As>(ref this.V0R).ToVector256(), - Unsafe.As>(ref this.V4R), - 1); - - Vector256 r5 = Avx.InsertVector128( - Unsafe.As>(ref this.V1R).ToVector256(), - Unsafe.As>(ref this.V5R), - 1); - - Vector256 r6 = Avx.InsertVector128( - Unsafe.As>(ref this.V2R).ToVector256(), - Unsafe.As>(ref this.V6R), - 1); - - Vector256 r7 = Avx.InsertVector128( - Unsafe.As>(ref this.V3R).ToVector256(), - Unsafe.As>(ref this.V7R), - 1); - - Vector256 t0 = Avx.UnpackLow(r0, r1); - Vector256 t2 = Avx.UnpackLow(r2, r3); - Vector256 v = Avx.Shuffle(t0, t2, 0x4E); - d.V0 = Avx.Blend(t0, v, 0xCC); - d.V1 = Avx.Blend(t2, v, 0x33); - - Vector256 t4 = Avx.UnpackLow(r4, r5); - Vector256 t6 = Avx.UnpackLow(r6, r7); - v = Avx.Shuffle(t4, t6, 0x4E); - d.V4 = Avx.Blend(t4, v, 0xCC); - d.V5 = Avx.Blend(t6, v, 0x33); - - Vector256 t1 = Avx.UnpackHigh(r0, r1); - Vector256 t3 = Avx.UnpackHigh(r2, r3); - v = Avx.Shuffle(t1, t3, 0x4E); - d.V2 = Avx.Blend(t1, v, 0xCC); - d.V3 = Avx.Blend(t3, 
v, 0x33); - - Vector256 t5 = Avx.UnpackHigh(r4, r5); - Vector256 t7 = Avx.UnpackHigh(r6, r7); - v = Avx.Shuffle(t5, t7, 0x4E); - d.V6 = Avx.Blend(t5, v, 0xCC); - d.V7 = Avx.Blend(t7, v, 0x33); + this.TransposeInplace_Avx(); } else #endif { - d.V0L.X = this.V0L.X; - d.V1L.X = this.V0L.Y; - d.V2L.X = this.V0L.Z; - d.V3L.X = this.V0L.W; - d.V4L.X = this.V0R.X; - d.V5L.X = this.V0R.Y; - d.V6L.X = this.V0R.Z; - d.V7L.X = this.V0R.W; - - d.V0L.Y = this.V1L.X; - d.V1L.Y = this.V1L.Y; - d.V2L.Y = this.V1L.Z; - d.V3L.Y = this.V1L.W; - d.V4L.Y = this.V1R.X; - d.V5L.Y = this.V1R.Y; - d.V6L.Y = this.V1R.Z; - d.V7L.Y = this.V1R.W; - - d.V0L.Z = this.V2L.X; - d.V1L.Z = this.V2L.Y; - d.V2L.Z = this.V2L.Z; - d.V3L.Z = this.V2L.W; - d.V4L.Z = this.V2R.X; - d.V5L.Z = this.V2R.Y; - d.V6L.Z = this.V2R.Z; - d.V7L.Z = this.V2R.W; - - d.V0L.W = this.V3L.X; - d.V1L.W = this.V3L.Y; - d.V2L.W = this.V3L.Z; - d.V3L.W = this.V3L.W; - d.V4L.W = this.V3R.X; - d.V5L.W = this.V3R.Y; - d.V6L.W = this.V3R.Z; - d.V7L.W = this.V3R.W; - - d.V0R.X = this.V4L.X; - d.V1R.X = this.V4L.Y; - d.V2R.X = this.V4L.Z; - d.V3R.X = this.V4L.W; - d.V4R.X = this.V4R.X; - d.V5R.X = this.V4R.Y; - d.V6R.X = this.V4R.Z; - d.V7R.X = this.V4R.W; - - d.V0R.Y = this.V5L.X; - d.V1R.Y = this.V5L.Y; - d.V2R.Y = this.V5L.Z; - d.V3R.Y = this.V5L.W; - d.V4R.Y = this.V5R.X; - d.V5R.Y = this.V5R.Y; - d.V6R.Y = this.V5R.Z; - d.V7R.Y = this.V5R.W; - - d.V0R.Z = this.V6L.X; - d.V1R.Z = this.V6L.Y; - d.V2R.Z = this.V6L.Z; - d.V3R.Z = this.V6L.W; - d.V4R.Z = this.V6R.X; - d.V5R.Z = this.V6R.Y; - d.V6R.Z = this.V6R.Z; - d.V7R.Z = this.V6R.W; - - d.V0R.W = this.V7L.X; - d.V1R.W = this.V7L.Y; - d.V2R.W = this.V7L.Z; - d.V3R.W = this.V7L.W; - d.V4R.W = this.V7R.X; - d.V5R.W = this.V7R.Y; - d.V6R.W = this.V7R.Z; - d.V7R.W = this.V7R.W; + this.TransposeInplace_Scalar(); } } /// - /// Compares entire 8x8 block to a single scalar value. + /// Scalar inplace transpose implementation for /// - /// Value to compare to. 
- public bool EqualsToScalar(int value) - { -#if SUPPORTS_RUNTIME_INTRINSICS - if (Avx2.IsSupported) + [MethodImpl(InliningOptions.ShortMethod)] + private void TransposeInplace_Scalar() + { + ref float elemRef = ref Unsafe.As(ref this); + + // row #0 + Swap(ref Unsafe.Add(ref elemRef, 1), ref Unsafe.Add(ref elemRef, 8)); + Swap(ref Unsafe.Add(ref elemRef, 2), ref Unsafe.Add(ref elemRef, 16)); + Swap(ref Unsafe.Add(ref elemRef, 3), ref Unsafe.Add(ref elemRef, 24)); + Swap(ref Unsafe.Add(ref elemRef, 4), ref Unsafe.Add(ref elemRef, 32)); + Swap(ref Unsafe.Add(ref elemRef, 5), ref Unsafe.Add(ref elemRef, 40)); + Swap(ref Unsafe.Add(ref elemRef, 6), ref Unsafe.Add(ref elemRef, 48)); + Swap(ref Unsafe.Add(ref elemRef, 7), ref Unsafe.Add(ref elemRef, 56)); + + // row #1 + Swap(ref Unsafe.Add(ref elemRef, 10), ref Unsafe.Add(ref elemRef, 17)); + Swap(ref Unsafe.Add(ref elemRef, 11), ref Unsafe.Add(ref elemRef, 25)); + Swap(ref Unsafe.Add(ref elemRef, 12), ref Unsafe.Add(ref elemRef, 33)); + Swap(ref Unsafe.Add(ref elemRef, 13), ref Unsafe.Add(ref elemRef, 41)); + Swap(ref Unsafe.Add(ref elemRef, 14), ref Unsafe.Add(ref elemRef, 49)); + Swap(ref Unsafe.Add(ref elemRef, 15), ref Unsafe.Add(ref elemRef, 57)); + + // row #2 + Swap(ref Unsafe.Add(ref elemRef, 19), ref Unsafe.Add(ref elemRef, 26)); + Swap(ref Unsafe.Add(ref elemRef, 20), ref Unsafe.Add(ref elemRef, 34)); + Swap(ref Unsafe.Add(ref elemRef, 21), ref Unsafe.Add(ref elemRef, 42)); + Swap(ref Unsafe.Add(ref elemRef, 22), ref Unsafe.Add(ref elemRef, 50)); + Swap(ref Unsafe.Add(ref elemRef, 23), ref Unsafe.Add(ref elemRef, 58)); + + // row #3 + Swap(ref Unsafe.Add(ref elemRef, 28), ref Unsafe.Add(ref elemRef, 35)); + Swap(ref Unsafe.Add(ref elemRef, 29), ref Unsafe.Add(ref elemRef, 43)); + Swap(ref Unsafe.Add(ref elemRef, 30), ref Unsafe.Add(ref elemRef, 51)); + Swap(ref Unsafe.Add(ref elemRef, 31), ref Unsafe.Add(ref elemRef, 59)); + + // row #4 + Swap(ref Unsafe.Add(ref elemRef, 37), ref Unsafe.Add(ref elemRef, 
44)); + Swap(ref Unsafe.Add(ref elemRef, 38), ref Unsafe.Add(ref elemRef, 52)); + Swap(ref Unsafe.Add(ref elemRef, 39), ref Unsafe.Add(ref elemRef, 60)); + + // row #5 + Swap(ref Unsafe.Add(ref elemRef, 46), ref Unsafe.Add(ref elemRef, 53)); + Swap(ref Unsafe.Add(ref elemRef, 47), ref Unsafe.Add(ref elemRef, 61)); + + // row #6 + Swap(ref Unsafe.Add(ref elemRef, 55), ref Unsafe.Add(ref elemRef, 62)); + + static void Swap(ref float a, ref float b) { - const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111); - - var targetVector = Vector256.Create(value); - ref Vector256 blockStride = ref this.V0; - - for (int i = 0; i < RowCount; i++) - { - Vector256 areEqual = Avx2.CompareEqual(Avx.ConvertToVector256Int32WithTruncation(Unsafe.Add(ref this.V0, i)), targetVector); - if (Avx2.MoveMask(areEqual.AsByte()) != equalityMask) - { - return false; - } - } - - return true; + float tmp = a; + a = b; + b = tmp; } -#endif - { - ref float scalars = ref Unsafe.As(ref this); - - for (int i = 0; i < Size; i++) - { - if ((int)Unsafe.Add(ref scalars, i) != value) - { - return false; - } - } + } - return true; - } + [MethodImpl(InliningOptions.ShortMethod)] + private static Vector NormalizeAndRound(Vector row, Vector off, Vector max) + { + row += off; + row = Vector.Max(row, Vector.Zero); + row = Vector.Min(row, max); + return row.FastRound(); } } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs index b5a51c5a4a..bc9a53ea04 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs @@ -58,11 +58,6 @@ internal class HuffmanScanDecoder /// private readonly HuffmanTable[] acHuffmanTables; - /// - /// The unzig data. 
- /// - private ZigZag dctZigZag; - private HuffmanScanBuffer scanBuffer; private readonly SpectralConverter spectralConverter; @@ -80,7 +75,6 @@ public HuffmanScanDecoder( SpectralConverter converter, CancellationToken cancellationToken) { - this.dctZigZag = ZigZag.CreateUnzigTable(); this.stream = stream; this.spectralConverter = converter; this.cancellationToken = cancellationToken; @@ -483,7 +477,6 @@ private void DecodeBlockBaseline( { ref short blockDataRef = ref Unsafe.As(ref block); ref HuffmanScanBuffer buffer = ref this.scanBuffer; - ref ZigZag zigzag = ref this.dctZigZag; // DC int t = buffer.DecodeHuffman(ref dcTable); @@ -508,7 +501,7 @@ private void DecodeBlockBaseline( { i += r; s = buffer.Receive(s); - Unsafe.Add(ref blockDataRef, zigzag[i++]) = (short)s; + Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[i++]) = (short)s; } else { @@ -562,7 +555,6 @@ private void DecodeBlockProgressiveAC(ref Block8x8 block, ref HuffmanTable acTab } ref HuffmanScanBuffer buffer = ref this.scanBuffer; - ref ZigZag zigzag = ref this.dctZigZag; int start = this.SpectralStart; int end = this.SpectralEnd; int low = this.SuccessiveLow; @@ -578,7 +570,7 @@ private void DecodeBlockProgressiveAC(ref Block8x8 block, ref HuffmanTable acTab if (s != 0) { s = buffer.Receive(s); - Unsafe.Add(ref blockDataRef, zigzag[i]) = (short)(s << low); + Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[i]) = (short)(s << low); } else { @@ -608,7 +600,6 @@ private void DecodeBlockProgressiveACRefined(ref short blockDataRef, ref Huffman { // Refinement scan for these AC coefficients ref HuffmanScanBuffer buffer = ref this.scanBuffer; - ref ZigZag zigzag = ref this.dctZigZag; int start = this.SpectralStart; int end = this.SpectralEnd; @@ -655,7 +646,7 @@ private void DecodeBlockProgressiveACRefined(ref short blockDataRef, ref Huffman do { - ref short coef = ref Unsafe.Add(ref blockDataRef, zigzag[k]); + ref short coef = ref Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[k]); if (coef != 0) { 
buffer.CheckBits(); @@ -681,7 +672,7 @@ private void DecodeBlockProgressiveACRefined(ref short blockDataRef, ref Huffman if ((s != 0) && (k < 64)) { - Unsafe.Add(ref blockDataRef, zigzag[k]) = (short)s; + Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[k]) = (short)s; } } } @@ -690,7 +681,7 @@ private void DecodeBlockProgressiveACRefined(ref short blockDataRef, ref Huffman { for (; k <= end; k++) { - ref short coef = ref Unsafe.Add(ref blockDataRef, zigzag[k]); + ref short coef = ref Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[k]); if (coef != 0) { diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/IRawJpegData.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/IRawJpegData.cs index 391dac784f..0b80acc5dc 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/IRawJpegData.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/IRawJpegData.cs @@ -22,7 +22,7 @@ internal interface IRawJpegData : IDisposable IJpegComponent[] Components { get; } /// - /// Gets the quantization tables, in zigzag order. + /// Gets the quantization tables, in natural order. /// Block8x8F[] QuantizationTables { get; } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs index 7cfbaddcc1..085cd4a291 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs @@ -19,14 +19,9 @@ internal struct JpegBlockPostProcessor public Block8x8F SourceBlock; /// - /// Temporal block 1 to store intermediate and/or final computation results. + /// Temporal block to store intermediate computation results. /// - public Block8x8F WorkspaceBlock1; - - /// - /// Temporal block 2 to store intermediate and/or final computation results. - /// - public Block8x8F WorkspaceBlock2; + public Block8x8F WorkspaceBlock; /// /// The quantization table as . 
@@ -46,12 +41,11 @@ internal struct JpegBlockPostProcessor public JpegBlockPostProcessor(IRawJpegData decoder, IJpegComponent component) { int qtIndex = component.QuantizationTableIndex; - this.DequantiazationTable = ZigZag.CreateDequantizationTable(ref decoder.QuantizationTables[qtIndex]); + this.DequantiazationTable = decoder.QuantizationTables[qtIndex]; this.subSamplingDivisors = component.SubSamplingDivisors; this.SourceBlock = default; - this.WorkspaceBlock1 = default; - this.WorkspaceBlock2 = default; + this.WorkspaceBlock = default; } /// @@ -71,20 +65,20 @@ public void ProcessBlockColorsInto( int destAreaStride, float maximumValue) { - ref Block8x8F b = ref this.SourceBlock; - b.LoadFrom(ref sourceBlock); + ref Block8x8F block = ref this.SourceBlock; + block.LoadFrom(ref sourceBlock); // Dequantize: - b.MultiplyInPlace(ref this.DequantiazationTable); + block.MultiplyInPlace(ref this.DequantiazationTable); - FastFloatingPointDCT.TransformIDCT(ref b, ref this.WorkspaceBlock1, ref this.WorkspaceBlock2); + FastFloatingPointDCT.TransformIDCT(ref block, ref this.WorkspaceBlock); // To conform better to libjpeg we actually NEED TO loose precision here. // This is because they store blocks as Int16 between all the operations. // To be "more accurate", we need to emulate this by rounding! 
- this.WorkspaceBlock1.NormalizeColorsAndRoundInPlace(maximumValue); + block.NormalizeColorsAndRoundInPlace(maximumValue); - this.WorkspaceBlock1.ScaledCopyTo( + block.ScaledCopyTo( ref destAreaOrigin, destAreaStride, this.subSamplingDivisors.Width, diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs index 23bb01409c..e975b11fbb 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs @@ -39,6 +39,6 @@ internal abstract class SpectralConverter /// The jpeg frame with the color space to convert to. /// The raw JPEG data. /// The color converter. - public virtual JpegColorConverter GetColorConverter(JpegFrame frame, IRawJpegData jpegData) => JpegColorConverter.GetConverter(jpegData.ColorSpace, frame.Precision); + protected virtual JpegColorConverter GetColorConverter(JpegFrame frame, IRawJpegData jpegData) => JpegColorConverter.GetConverter(jpegData.ColorSpace, frame.Precision); } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs index ec77bf87db..44b39dfd71 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs @@ -5,10 +5,25 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder { /// /// A compiled look-up table representation of a huffmanSpec. - /// Each value maps to a int32 of which the 24 most significant bits hold the - /// codeword in bits and the 8 least significant bits hold the codeword size. /// The maximum codeword size is 16 bits. /// + /// + /// + /// Each value maps to a int32 of which the 24 most significant bits hold the + /// codeword in bits and the 8 least significant bits hold the codeword size. 
+ /// + /// + /// Code value occupies 24 most significant bits as integer value. + /// This value is shifted to the MSB position for performance reasons. + /// For example, decimal value 10 is stored like this: + /// + /// MSB LSB + /// 1010 0000 00000000 00000000 | 00000100 + /// + /// This was done to eliminate extra binary shifts in the encoder. + /// While code length is represented as 8 bit integer value + /// + /// internal readonly struct HuffmanLut { /// @@ -54,7 +69,7 @@ public HuffmanLut(HuffmanSpec spec) int len = i + 1; for (int j = 0; j < spec.Count[i]; j++) { - this.Values[spec.Values[k]] = len | (code << 8); + this.Values[spec.Values[k]] = len | (code << (32 - len)); code++; k++; } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs index 4b74400cac..b3cdbf0a05 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs @@ -1,12 +1,11 @@ // Copyright (c) Six Labors. // Licensed under the Apache License, Version 2.0. +using System; using System.IO; +using System.Numerics; using System.Runtime.CompilerServices; -#if SUPPORTS_RUNTIME_INTRINSICS -using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.X86; -#endif +using System.Runtime.InteropServices; using System.Threading; using SixLabors.ImageSharp.Memory; using SixLabors.ImageSharp.PixelFormats; @@ -16,49 +15,118 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder internal class HuffmanScanEncoder { /// - /// Compiled huffman tree to encode given values. + /// Maximum number of bytes encoded jpeg 8x8 block can occupy. + /// It's highly unlikely for block to occupy this much space - it's a theoretical limit. /// - /// Yields codewords by index consisting of [run length | bitsize]. 
- private HuffmanLut[] huffmanTables; + /// + /// Where 16 is maximum huffman code binary length according to itu + /// specs. 10 is maximum value binary length, value comes from discrete + /// cosine transform with value range: [-1024..1023]. Block stores + /// 8x8 = 64 values thus multiplication by 64. Then divided by 8 to get + /// the number of bytes. This value is then multiplied by + /// for performance reasons. + /// + private const int MaxBytesPerBlock = (16 + 10) * 64 / 8 * MaxBytesPerBlockMultiplier; /// - /// Number of bytes cached before being written to target stream via Stream.Write(byte[], offest, count). + /// Multiplier used within cache buffers size calculation. /// /// - /// This is subject to change, 1024 seems to be the best value in terms of performance. - /// expects it to be at least 8 (see comments in method body). + /// + /// Theoretically, bytes buffer can fit + /// exactly one minimal coding unit. In reality, coding blocks occupy much + /// less space than the theoretical maximum - this can be exploited. + /// If temporal buffer size is multiplied by at least 2, second half of + /// the resulting buffer will be used as an overflow 'guard' if next + /// block would occupy maximum number of bytes. While first half may fit + /// many blocks before needing to flush. + /// + /// + /// This is subject to change. This can be equal to 1 but recommended + /// value is 2 or even greater - further benchmarking needed. + /// /// - private const int EmitBufferSizeInBytes = 1024; /// - /// A buffer for reducing the number of stream writes when emitting Huffman tables. + /// size multiplier. /// - private readonly byte[] emitBuffer = new byte[EmitBufferSizeInBytes]; + /// + /// Jpeg specification requires inserting 'stuff' bytes after each + /// 0xff byte value. Worst case scenario is when all bytes are 0xff.
+ /// While it's highly unlikely (if not impossible) to get such + /// combination, it's theoretically possible so buffer size must be guarded. + /// + private const int OutputBufferLengthMultiplier = 2; /// - /// Number of filled bytes in buffer + /// Compiled huffman tree to encode given values. /// - private int emitLen = 0; + /// Yields codewords by index consisting of [run length | bitsize]. + private HuffmanLut[] huffmanTables; /// /// Emitted bits 'micro buffer' before being transferred to the . /// - private int accumulatedBits; + private uint accumulatedBits; + + /// + /// Buffer for temporal storage of huffman rle encoding bit data. + /// + /// + /// Encoding bits are assembled to 4 byte unsigned integers and then copied to this buffer. + /// This process does NOT include inserting stuff bytes. + /// + private readonly uint[] emitBuffer; + + /// + /// Buffer for temporal storage which is then written to the output stream. + /// + /// + /// Encoding bits from are copied to this byte buffer including stuff bytes. + /// + private readonly byte[] streamWriteBuffer; /// /// Number of jagged bits stored in /// private int bitCount; - private Block8x8F temporalBlock1; - private Block8x8F temporalBlock2; + private int emitWriteIndex; + + private Block8x8 tempBlock; /// /// The output stream. All attempted writes after the first error become no-ops. /// private readonly Stream target; - public HuffmanScanEncoder(Stream outputStream) => this.target = outputStream; + /// + /// Initializes a new instance of the class. + /// + /// Amount of encoded 8x8 blocks per single jpeg macroblock. + /// Output stream for saving encoded data. 
+ public HuffmanScanEncoder(int blocksPerCodingUnit, Stream outputStream) + { + int emitBufferByteLength = MaxBytesPerBlock * blocksPerCodingUnit; + this.emitBuffer = new uint[emitBufferByteLength / sizeof(uint)]; + this.emitWriteIndex = this.emitBuffer.Length; + + this.streamWriteBuffer = new byte[emitBufferByteLength * OutputBufferLengthMultiplier]; + + this.target = outputStream; + } + + /// + /// Gets a value indicating whether is full + /// and must be flushed using + /// before encoding next 8x8 coding block. + /// + private bool IsStreamFlushNeeded + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get => this.emitWriteIndex < (uint)this.emitBuffer.Length / 2; + } /// /// Encodes the image with no subsampling. @@ -71,9 +139,10 @@ internal class HuffmanScanEncoder public void Encode444(Image pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken) where TPixel : unmanaged, IPixel { - this.huffmanTables = HuffmanLut.TheHuffmanLut; + FastFloatingPointDCT.AdjustToFDCT(ref luminanceQuantTable); + FastFloatingPointDCT.AdjustToFDCT(ref chrominanceQuantTable); - var unzig = ZigZag.CreateUnzigTable(); + this.huffmanTables = HuffmanLut.TheHuffmanLut; // ReSharper disable once InconsistentNaming int prevDCY = 0, prevDCCb = 0, prevDCCr = 0; @@ -97,26 +166,28 @@ public void Encode444(Image pixels, ref Block8x8F luminanceQuant QuantIndex.Luminance, prevDCY, ref pixelConverter.Y, - ref luminanceQuantTable, - ref unzig); + ref luminanceQuantTable); prevDCCb = this.WriteBlock( QuantIndex.Chrominance, prevDCCb, ref pixelConverter.Cb, - ref chrominanceQuantTable, - ref unzig); + ref chrominanceQuantTable); prevDCCr = this.WriteBlock( QuantIndex.Chrominance, prevDCCr, ref pixelConverter.Cr, - ref chrominanceQuantTable, - ref unzig); + ref chrominanceQuantTable); + + if (this.IsStreamFlushNeeded) + { + this.FlushToStream(); + } } } - this.FlushInternalBuffer(); + this.FlushRemainingBytes(); } /// @@ -131,9 
+202,10 @@ public void Encode444(Image pixels, ref Block8x8F luminanceQuant public void Encode420(Image pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken) where TPixel : unmanaged, IPixel { - this.huffmanTables = HuffmanLut.TheHuffmanLut; + FastFloatingPointDCT.AdjustToFDCT(ref luminanceQuantTable); + FastFloatingPointDCT.AdjustToFDCT(ref chrominanceQuantTable); - var unzig = ZigZag.CreateUnzigTable(); + this.huffmanTables = HuffmanLut.TheHuffmanLut; // ReSharper disable once InconsistentNaming int prevDCY = 0, prevDCCb = 0, prevDCCr = 0; @@ -158,34 +230,35 @@ public void Encode420(Image pixels, ref Block8x8F luminanceQuant QuantIndex.Luminance, prevDCY, ref pixelConverter.YLeft, - ref luminanceQuantTable, - ref unzig); + ref luminanceQuantTable); prevDCY = this.WriteBlock( QuantIndex.Luminance, prevDCY, ref pixelConverter.YRight, - ref luminanceQuantTable, - ref unzig); + ref luminanceQuantTable); } prevDCCb = this.WriteBlock( QuantIndex.Chrominance, prevDCCb, ref pixelConverter.Cb, - ref chrominanceQuantTable, - ref unzig); + ref chrominanceQuantTable); prevDCCr = this.WriteBlock( QuantIndex.Chrominance, prevDCCr, ref pixelConverter.Cr, - ref chrominanceQuantTable, - ref unzig); + ref chrominanceQuantTable); + + if (this.IsStreamFlushNeeded) + { + this.FlushToStream(); + } } } - this.FlushInternalBuffer(); + this.FlushRemainingBytes(); } /// @@ -198,9 +271,9 @@ public void Encode420(Image pixels, ref Block8x8F luminanceQuant public void EncodeGrayscale(Image pixels, ref Block8x8F luminanceQuantTable, CancellationToken cancellationToken) where TPixel : unmanaged, IPixel { - this.huffmanTables = HuffmanLut.TheHuffmanLut; + FastFloatingPointDCT.AdjustToFDCT(ref luminanceQuantTable); - var unzig = ZigZag.CreateUnzigTable(); + this.huffmanTables = HuffmanLut.TheHuffmanLut; // ReSharper disable once InconsistentNaming int prevDCY = 0; @@ -223,12 +296,16 @@ public void EncodeGrayscale(Image pixels, ref 
Block8x8F luminanc QuantIndex.Luminance, prevDCY, ref pixelConverter.Y, - ref luminanceQuantTable, - ref unzig); + ref luminanceQuantTable); + + if (this.IsStreamFlushNeeded) + { + this.FlushToStream(); + } } } - this.FlushInternalBuffer(); + this.FlushRemainingBytes(); } /// @@ -236,14 +313,14 @@ public void EncodeGrayscale(Image pixels, ref Block8x8F luminanc /// /// The pixel format. /// The pixel accessor providing access to the image pixels. - /// Luminance quantization table provided by the callee. + /// Quantization table provided by the callee. /// The token to monitor for cancellation. - public void EncodeRgb(Image pixels, ref Block8x8F luminanceQuantTable, CancellationToken cancellationToken) + public void EncodeRgb(Image pixels, ref Block8x8F quantTable, CancellationToken cancellationToken) where TPixel : unmanaged, IPixel { - this.huffmanTables = HuffmanLut.TheHuffmanLut; + FastFloatingPointDCT.AdjustToFDCT(ref quantTable); - var unzig = ZigZag.CreateUnzigTable(); + this.huffmanTables = HuffmanLut.TheHuffmanLut; // ReSharper disable once InconsistentNaming int prevDCR = 0, prevDCG = 0, prevDCB = 0; @@ -267,26 +344,28 @@ public void EncodeRgb(Image pixels, ref Block8x8F luminanceQuant QuantIndex.Luminance, prevDCR, ref pixelConverter.R, - ref luminanceQuantTable, - ref unzig); + ref quantTable); prevDCG = this.WriteBlock( QuantIndex.Luminance, prevDCG, ref pixelConverter.G, - ref luminanceQuantTable, - ref unzig); + ref quantTable); prevDCB = this.WriteBlock( QuantIndex.Luminance, prevDCB, ref pixelConverter.B, - ref luminanceQuantTable, - ref unzig); + ref quantTable); + + if (this.IsStreamFlushNeeded) + { + this.FlushToStream(); + } } } - this.FlushInternalBuffer(); + this.FlushRemainingBytes(); } /// @@ -296,47 +375,53 @@ public void EncodeRgb(Image pixels, ref Block8x8F luminanceQuant /// /// The quantization table index. /// The previous DC value. - /// Source block - /// Quantization table - /// The 8x8 Unzig block. + /// Source block. 
+ /// Quantization table. /// The . private int WriteBlock( QuantIndex index, int prevDC, - ref Block8x8F src, - ref Block8x8F quant, - ref ZigZag unZig) + ref Block8x8F block, + ref Block8x8F quant) { - ref Block8x8F refTemp1 = ref this.temporalBlock1; - ref Block8x8F refTemp2 = ref this.temporalBlock2; + ref Block8x8 spectralBlock = ref this.tempBlock; - FastFloatingPointDCT.TransformFDCT(ref src, ref refTemp1, ref refTemp2); + // Shifting level from 0..255 to -128..127 + block.AddInPlace(-128f); - Block8x8F.Quantize(ref refTemp1, ref refTemp2, ref quant, ref unZig); + // Discrete cosine transform + FastFloatingPointDCT.TransformFDCT(ref block); + + // Quantization + Block8x8F.Quantize(ref block, ref spectralBlock, ref quant); // Emit the DC delta. - int dc = (int)refTemp2[0]; - this.EmitDirectCurrentTerm(this.huffmanTables[2 * (int)index].Values, dc - prevDC); + int dc = spectralBlock[0]; + this.EmitHuffRLE(this.huffmanTables[2 * (int)index].Values, 0, dc - prevDC); // Emit the AC components. int[] acHuffTable = this.huffmanTables[(2 * (int)index) + 1].Values; + nint lastValuableIndex = spectralBlock.GetLastNonZeroIndex(); + int runLength = 0; - int lastValuableIndex = GetLastValuableElementIndex(ref refTemp2); - for (int zig = 1; zig <= lastValuableIndex; zig++) + ref short blockRef = ref Unsafe.As(ref spectralBlock); + for (nint zig = 1; zig <= lastValuableIndex; zig++) { - int ac = (int)refTemp2[zig]; + const int zeroRun1 = 1 << 4; + const int zeroRun16 = 16 << 4; + int ac = Unsafe.Add(ref blockRef, zig); if (ac == 0) { - runLength++; + runLength += zeroRun1; } else { - while (runLength > 15) + while (runLength >= zeroRun16) { this.EmitHuff(acHuffTable, 0xf0); - runLength -= 16; + runLength -= zeroRun16; } this.EmitHuffRLE(acHuffTable, runLength, ac); @@ -356,100 +441,89 @@ private int WriteBlock( } /// - /// Emits the least significant count of bits to the stream write buffer. - /// The precondition is bits - /// - /// < 1<<nBits && nBits <= 16 - /// - /// . 
+ /// Emits the most significant count of bits to the buffer. /// - /// The packed bits. - /// The number of bits + /// + /// + /// Supports up to 32 count of bits but, generally speaking, jpeg + /// standard assures that there won't be more than 16 bits per single + /// value. + /// + /// + /// Emitting algorithm uses 3 intermediate buffers for caching before + /// writing to the stream: + /// + /// + /// uint32 + /// + /// Bit buffer. Encoded spectral values can occupy up to 16 bits, bits + /// are assembled to whole bytes via this intermediate buffer. + /// + /// + /// + /// uint32[] + /// + /// Assembled bytes from uint32 buffer are saved into this buffer. + /// uint32 buffer values are saved using indices from the last to the first. + /// As bytes are saved to the memory as 4-byte packages endianness matters: + /// Jpeg stream is big-endian, indexing buffer bytes from the last index to the + /// first eliminates all operations to extract separate bytes. This only works for + /// little-endian machines (there are no known examples of big-endian users atm). + /// For big-endians this approach is slower due to the separate byte extraction. + /// + /// + /// + /// byte[] + /// + /// Byte buffer used only during method. + /// + /// + /// + /// + /// + /// Bits to emit, must be shifted to the left. + /// Bits count stored in the bits parameter. [MethodImpl(InliningOptions.ShortMethod)] - private void Emit(int bits, int count) + private void Emit(uint bits, int count) { + this.accumulatedBits |= bits >> this.bitCount; + count += this.bitCount; - bits <<= 32 - count; - bits |= this.accumulatedBits; - // Only write if more than 8 bits. 
- if (count >= 8) + if (count >= 32) { - // Track length - while (count >= 8) - { - byte b = (byte)(bits >> 24); - this.emitBuffer[this.emitLen++] = b; - - // Adding stuff byte - // This is because by JPEG standard scan data can contain JPEG markers (indicated by the 0xFF byte, followed by a non-zero byte) - // Considering this every 0xFF byte must be followed by 0x00 padding byte to signal that this is not a marker - if (b == byte.MaxValue) - { - this.emitBuffer[this.emitLen++] = byte.MinValue; - } - - bits <<= 8; - count -= 8; - } + this.emitBuffer[--this.emitWriteIndex] = this.accumulatedBits; + this.accumulatedBits = bits << (32 - this.bitCount); - // This can emit 4 times of: - // 1 byte guaranteed - // 1 extra byte.MinValue byte if previous one was byte.MaxValue - // Thus writing (1 + 1) * 4 = 8 bytes max - // So we must check if emit buffer has extra 8 bytes, if not - call stream.Write - if (this.emitLen > EmitBufferSizeInBytes - 8) - { - this.target.Write(this.emitBuffer, 0, this.emitLen); - this.emitLen = 0; - } + count -= 32; } - this.accumulatedBits = bits; this.bitCount = count; } /// - /// Emits the given value with the given Huffman encoder. + /// Emits the given value with the given Huffman table. /// - /// Compiled Huffman spec values. - /// The value to encode. + /// Huffman table. + /// Value to encode. [MethodImpl(InliningOptions.ShortMethod)] private void EmitHuff(int[] table, int value) { int x = table[value]; - this.Emit(x >> 8, x & 0xff); - } - - [MethodImpl(InliningOptions.ShortMethod)] - private void EmitDirectCurrentTerm(int[] table, int value) - { - int a = value; - int b = value; - if (a < 0) - { - a = -value; - b = value - 1; - } - - int bt = GetHuffmanEncodingLength((uint)a); - - this.EmitHuff(table, bt); - if (bt > 0) - { - this.Emit(b & ((1 << bt) - 1), bt); - } + this.Emit((uint)x & 0xffff_ff00u, x & 0xff); } /// - /// Emits a run of runLength copies of value encoded with the given Huffman encoder. 
+ /// Emits given value via huffman rle encoding. /// - /// Compiled Huffman spec values. - /// The number of copies to encode. - /// The value to encode. + /// Huffman table. + /// The number of preceding zeroes, preshifted by 4 to the left. + /// Value to encode. [MethodImpl(InliningOptions.ShortMethod)] private void EmitHuffRLE(int[] table, int runLength, int value) { + DebugGuard.IsTrue((runLength & 0xf) == 0, $"{nameof(runLength)} parameter must be shifted to the left by 4 bits"); + int a = value; int b = value; if (a < 0) @@ -458,25 +532,18 @@ private void EmitHuffRLE(int[] table, int runLength, int value) b = value - 1; } - int bt = GetHuffmanEncodingLength((uint)a); + int valueLen = GetHuffmanEncodingLength((uint)a); - this.EmitHuff(table, (runLength << 4) | bt); - this.Emit(b & ((1 << bt) - 1), bt); - } + // Huffman prefix code + int huffPackage = table[runLength | valueLen]; + int prefixLen = huffPackage & 0xff; + uint prefix = (uint)huffPackage & 0xffff_0000u; - /// - /// Writes remaining bytes from internal buffer to the target stream. 
- /// - /// Pads last byte with 1's if necessary - private void FlushInternalBuffer() - { - // pad last byte with 1's - int padBitsCount = 8 - (this.bitCount % 8); - if (padBitsCount != 0) - { - this.Emit((1 << padBitsCount) - 1, padBitsCount); - this.target.Write(this.emitBuffer, 0, this.emitLen); - } + // Actual encoded value + uint encodedValue = (uint)b << (32 - valueLen); + + // Doing two binary shifts to get rid of leading 1's in negative value case + this.Emit(prefix | (encodedValue >> prefixLen), prefixLen + valueLen); } /// @@ -498,7 +565,7 @@ internal static int GetHuffmanEncodingLength(uint value) // Lzcnt would return 32 for input value of 0 - no need to check that with branching // Fallback code if Lzcnt is not supported still use if-check // But most modern CPUs support this instruction so this should not be a problem - return 32 - System.Numerics.BitOperations.LeadingZeroCount(value); + return 32 - BitOperations.LeadingZeroCount(value); #else // Ideally: // if 0 - return 0 in this case @@ -515,65 +582,108 @@ internal static int GetHuffmanEncodingLength(uint value) } /// - /// Returns index of the last non-zero element in given mcu block. - /// If all values of the mcu block are zero, this method might return different results depending on the runtime and hardware support. - /// This is jpeg mcu specific code, mcu[0] stores a dc value which will be encoded outside of the loop. - /// This method is guaranteed to return either -1 or 0 if all elements are zero. + /// General method for flushing cached spectral data bytes to + /// the output stream respecting stuff bytes. /// /// - /// This is an internal operation supposed to be used only in class for jpeg encoding. + /// Bytes cached via are stored in 4-bytes blocks + /// which makes this method endianness dependent. /// - /// Mcu block. - /// Index of the last non-zero element.
[MethodImpl(InliningOptions.ShortMethod)] - internal static int GetLastValuableElementIndex(ref Block8x8F mcu) + private void FlushToStream(int endIndex) { -#if SUPPORTS_RUNTIME_INTRINSICS - if (Avx2.IsSupported) - { - const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111); + Span emitBytes = MemoryMarshal.AsBytes(this.emitBuffer.AsSpan()); - Vector256 zero8 = Vector256.Zero; + int writeIdx = 0; + int startIndex = emitBytes.Length - 1; - ref Vector256 mcuStride = ref mcu.V0; - - for (int i = 7; i >= 0; i--) + // Some platforms may fail to eliminate this if-else branching + // Even if it happens - buffer is flushed in big packs, + // branching overhead shouldn't be noticeable + if (BitConverter.IsLittleEndian) + { + // For little endian case bytes are ordered and can be + // safely written to the stream with stuff bytes + // First byte is cached on the most significant index + // so we are going from the end of the array to its beginning: + // ... [ double word #1 ] [ double word #0 ] + // ... 
[idx3|idx2|idx1|idx0] [idx3|idx2|idx1|idx0] + for (int i = startIndex; i >= endIndex; i--) { - int areEqual = Avx2.MoveMask(Avx2.CompareEqual(Avx.ConvertToVector256Int32(Unsafe.Add(ref mcuStride, i)), zero8).AsByte()); + byte value = emitBytes[i]; + this.streamWriteBuffer[writeIdx++] = value; - // we do not know for sure if this stride contain all non-zero elements or if it has some trailing zeros - if (areEqual != equalityMask) + // Inserting stuff byte + if (value == 0xff) { - // last index in the stride, we go from the end to the start of the stride - int startIndex = i * 8; - int index = startIndex + 7; - ref float elemRef = ref Unsafe.As(ref mcu); - while (index >= startIndex && (int)Unsafe.Add(ref elemRef, index) == 0) - { - index--; - } - - // this implementation will return -1 if all ac components are zero and dc are zero - return index; + this.streamWriteBuffer[writeIdx++] = 0x00; } } - - return -1; } else -#endif { - int index = Block8x8F.Size - 1; - ref float elemRef = ref Unsafe.As(ref mcu); - - while (index > 0 && (int)Unsafe.Add(ref elemRef, index) == 0) + // For big endian case bytes are ordered in 4-byte packs + // which are ordered like bytes in the little endian case but in 4-byte packs: + // ... [ double word #1 ] [ double word #0 ] + // ...
[idx0|idx1|idx2|idx3] [idx0|idx1|idx2|idx3] + // So we must write each 4-bytes in 'natural order' + for (int i = startIndex; i >= endIndex; i -= 4) + { - index--; - } + // This loop is caused by the nature of underlying byte buffer + // implementation and indeed reduces performance by roughly 5% + // compared to little endian scenario + // Even with this performance drop this cached buffer implementation + // is faster than individually writing bytes using binary shifts and binary and(s) + for (int j = i - 3; j <= i; j++) + { + byte value = emitBytes[j]; + this.streamWriteBuffer[writeIdx++] = value; - // this implementation will return 0 if all ac components and dc are zero - return index; + // Inserting stuff byte + if (value == 0xff) + { + this.streamWriteBuffer[writeIdx++] = 0x00; + } + } + } } + + this.target.Write(this.streamWriteBuffer, 0, writeIdx); + } + + /// + /// Flushes spectral data bytes after encoding all channel blocks + /// in a single jpeg macroblock using . + /// + /// + /// This must be called only if is true + /// only during the macroblocks encoding routine. + /// + private void FlushToStream() + { + this.FlushToStream(this.emitWriteIndex * 4); + this.emitWriteIndex = this.emitBuffer.Length; + } + + /// + /// Flushes final cached bits to the stream padding 1's to + /// complement full bytes. + /// + /// + /// This must be called only once at the end of the encoding routine. + /// check is not needed.
+ /// + [MethodImpl(InliningOptions.ShortMethod)] + private void FlushRemainingBytes() + { + // Padding all 4 bytes with 1's while not corrupting initial bits stored in accumulatedBits + // and writing only the valuable (partially filled) bytes of that word to the output stream + int valuableBytesCount = (int)Numerics.DivideCeil((uint)this.bitCount, 8); + uint packedBytes = this.accumulatedBits | (uint.MaxValue >> this.bitCount); + this.emitBuffer[--this.emitWriteIndex] = packedBytes; + + // Bytes are flushed from high to low index (inclusive): stop after the top valuableBytesCount bytes of the word stored above + this.FlushToStream((this.emitWriteIndex * 4) + 4 - valuableBytesCount); } } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs new file mode 100644 index 0000000000..ab9462632f --- /dev/null +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs @@ -0,0 +1,161 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0.
+ +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Diagnostics; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; + +namespace SixLabors.ImageSharp.Formats.Jpeg.Components +{ + internal static partial class FastFloatingPointDCT + { +#pragma warning disable SA1310, SA1311, IDE1006 // naming rules violation warnings + private static readonly Vector256 mm256_F_0_7071 = Vector256.Create(0.707106781f); + private static readonly Vector256 mm256_F_0_3826 = Vector256.Create(0.382683433f); + private static readonly Vector256 mm256_F_0_5411 = Vector256.Create(0.541196100f); + private static readonly Vector256 mm256_F_1_3065 = Vector256.Create(1.306562965f); + + private static readonly Vector256 mm256_F_1_1758 = Vector256.Create(1.175876f); + private static readonly Vector256 mm256_F_n1_9615 = Vector256.Create(-1.961570560f); + private static readonly Vector256 mm256_F_n0_3901 = Vector256.Create(-0.390180644f); + private static readonly Vector256 mm256_F_n0_8999 = Vector256.Create(-0.899976223f); + private static readonly Vector256 mm256_F_n2_5629 = Vector256.Create(-2.562915447f); + private static readonly Vector256 mm256_F_0_2986 = Vector256.Create(0.298631336f); + private static readonly Vector256 mm256_F_2_0531 = Vector256.Create(2.053119869f); + private static readonly Vector256 mm256_F_3_0727 = Vector256.Create(3.072711026f); + private static readonly Vector256 mm256_F_1_5013 = Vector256.Create(1.501321110f); + private static readonly Vector256 mm256_F_n1_8477 = Vector256.Create(-1.847759065f); + private static readonly Vector256 mm256_F_0_7653 = Vector256.Create(0.765366865f); +#pragma warning restore SA1310, SA1311, IDE1006 + + /// + /// Apply floating point FDCT inplace using simd operations. + /// + /// Input matrix. 
+ private static void ForwardTransform_Avx(ref Block8x8F block) + { + DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation."); + + // First pass - process rows + block.TransposeInplace(); + FDCT8x8_Avx(ref block); + + // Second pass - process columns + block.TransposeInplace(); + FDCT8x8_Avx(ref block); + } + + /// + /// Apply 1D floating point FDCT inplace using AVX operations on 8x8 matrix. + /// + /// + /// Requires Avx support. + /// + /// Input matrix. + public static void FDCT8x8_Avx(ref Block8x8F block) + { + DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation."); + + Vector256 tmp0 = Avx.Add(block.V0, block.V7); + Vector256 tmp7 = Avx.Subtract(block.V0, block.V7); + Vector256 tmp1 = Avx.Add(block.V1, block.V6); + Vector256 tmp6 = Avx.Subtract(block.V1, block.V6); + Vector256 tmp2 = Avx.Add(block.V2, block.V5); + Vector256 tmp5 = Avx.Subtract(block.V2, block.V5); + Vector256 tmp3 = Avx.Add(block.V3, block.V4); + Vector256 tmp4 = Avx.Subtract(block.V3, block.V4); + + // Even part + Vector256 tmp10 = Avx.Add(tmp0, tmp3); + Vector256 tmp13 = Avx.Subtract(tmp0, tmp3); + Vector256 tmp11 = Avx.Add(tmp1, tmp2); + Vector256 tmp12 = Avx.Subtract(tmp1, tmp2); + + block.V0 = Avx.Add(tmp10, tmp11); + block.V4 = Avx.Subtract(tmp10, tmp11); + + Vector256 z1 = Avx.Multiply(Avx.Add(tmp12, tmp13), mm256_F_0_7071); + block.V2 = Avx.Add(tmp13, z1); + block.V6 = Avx.Subtract(tmp13, z1); + + // Odd part + tmp10 = Avx.Add(tmp4, tmp5); + tmp11 = Avx.Add(tmp5, tmp6); + tmp12 = Avx.Add(tmp6, tmp7); + + Vector256 z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), mm256_F_0_3826); + Vector256 z2 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_0_5411, tmp10); + Vector256 z4 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_1_3065, tmp12); + Vector256 z3 = Avx.Multiply(tmp11, mm256_F_0_7071); + + Vector256 z11 = Avx.Add(tmp7, z3); + Vector256 z13 = Avx.Subtract(tmp7, z3); + + block.V5 = Avx.Add(z13, z2); + block.V3 = 
Avx.Subtract(z13, z2); + block.V1 = Avx.Add(z11, z4); + block.V7 = Avx.Subtract(z11, z4); + } + + /// + /// Combined operation of and + /// using AVX commands. + /// + /// Source + /// Destination + public static void IDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d) + { + Debug.Assert(Avx.IsSupported, "AVX is required to execute this method"); + + Vector256 my1 = s.V1; + Vector256 my7 = s.V7; + Vector256 mz0 = Avx.Add(my1, my7); + + Vector256 my3 = s.V3; + Vector256 mz2 = Avx.Add(my3, my7); + Vector256 my5 = s.V5; + Vector256 mz1 = Avx.Add(my3, my5); + Vector256 mz3 = Avx.Add(my1, my5); + + Vector256 mz4 = Avx.Multiply(Avx.Add(mz0, mz1), mm256_F_1_1758); + + mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz2, mm256_F_n1_9615); + mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz3, mm256_F_n0_3901); + mz0 = Avx.Multiply(mz0, mm256_F_n0_8999); + mz1 = Avx.Multiply(mz1, mm256_F_n2_5629); + + Vector256 mb3 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my7, mm256_F_0_2986), mz2); + Vector256 mb2 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my5, mm256_F_2_0531), mz3); + Vector256 mb1 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my3, mm256_F_3_0727), mz2); + Vector256 mb0 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my1, mm256_F_1_5013), mz3); + + Vector256 my2 = s.V2; + Vector256 my6 = s.V6; + mz4 = Avx.Multiply(Avx.Add(my2, my6), mm256_F_0_5411); + Vector256 my0 = s.V0; + Vector256 my4 = s.V4; + mz0 = Avx.Add(my0, my4); + mz1 = Avx.Subtract(my0, my4); + mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my6, mm256_F_n1_8477); + mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my2, mm256_F_0_7653); + + my0 = Avx.Add(mz0, mz3); + my3 = Avx.Subtract(mz0, mz3); + my1 = Avx.Add(mz1, mz2); + my2 = Avx.Subtract(mz1, mz2); + + d.V0 = Avx.Add(my0, mb0); + d.V7 = Avx.Subtract(my0, mb0); + d.V1 = Avx.Add(my1, mb1); + d.V6 = Avx.Subtract(my1, mb1); + d.V2 = Avx.Add(my2, mb2); + d.V5 = Avx.Subtract(my2, mb2); + d.V3 = Avx.Add(my3, mb3); + d.V4 = Avx.Subtract(my3, mb3); + } + } 
+} +#endif diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs index 0f569b5da1..6963c36369 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs @@ -1,11 +1,9 @@ // Copyright (c) Six Labors. // Licensed under the Apache License, Version 2.0. -using System.Diagnostics; using System.Numerics; using System.Runtime.CompilerServices; #if SUPPORTS_RUNTIME_INTRINSICS -using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; #endif @@ -19,283 +17,304 @@ internal static partial class FastFloatingPointDCT { #pragma warning disable SA1310 // FieldNamesMustNotContainUnderscore private const float C_1_175876 = 1.175875602f; - private const float C_1_961571 = -1.961570560f; - private const float C_0_390181 = -0.390180644f; - private const float C_0_899976 = -0.899976223f; - private const float C_2_562915 = -2.562915447f; - private const float C_0_298631 = 0.298631336f; - private const float C_2_053120 = 2.053119869f; - private const float C_3_072711 = 3.072711026f; - private const float C_1_501321 = 1.501321110f; - private const float C_0_541196 = 0.541196100f; - private const float C_1_847759 = -1.847759065f; - private const float C_0_765367 = 0.765366865f; private const float C_0_125 = 0.1250f; -#if SUPPORTS_RUNTIME_INTRINSICS - private static readonly Vector256 C_V_0_5411 = Vector256.Create(0.541196f); - private static readonly Vector256 C_V_1_3065 = Vector256.Create(1.306563f); - private static readonly Vector256 C_V_1_1758 = Vector256.Create(1.175876f); - private static readonly Vector256 C_V_0_7856 = Vector256.Create(0.785695f); - private static readonly Vector256 C_V_1_3870 = Vector256.Create(1.387040f); - private static readonly Vector256 C_V_0_2758 = Vector256.Create(0.275899f); - - private static readonly Vector256 C_V_n1_9615 = Vector256.Create(-1.961570560f); - private static 
readonly Vector256 C_V_n0_3901 = Vector256.Create(-0.390180644f); - private static readonly Vector256 C_V_n0_8999 = Vector256.Create(-0.899976223f); - private static readonly Vector256 C_V_n2_5629 = Vector256.Create(-2.562915447f); - private static readonly Vector256 C_V_0_2986 = Vector256.Create(0.298631336f); - private static readonly Vector256 C_V_2_0531 = Vector256.Create(2.053119869f); - private static readonly Vector256 C_V_3_0727 = Vector256.Create(3.072711026f); - private static readonly Vector256 C_V_1_5013 = Vector256.Create(1.501321110f); - private static readonly Vector256 C_V_n1_8477 = Vector256.Create(-1.847759065f); - private static readonly Vector256 C_V_0_7653 = Vector256.Create(0.765366865f); - - private static readonly Vector256 C_V_InvSqrt2 = Vector256.Create(0.707107f); -#endif +#pragma warning disable SA1311, IDE1006 // naming rules violation warnings + private static readonly Vector4 mm128_F_0_7071 = new Vector4(0.707106781f); + private static readonly Vector4 mm128_F_0_3826 = new Vector4(0.382683433f); + private static readonly Vector4 mm128_F_0_5411 = new Vector4(0.541196100f); + private static readonly Vector4 mm128_F_1_3065 = new Vector4(1.306562965f); +#pragma warning restore SA1311, IDE1006 + #pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore - private static readonly Vector4 InvSqrt2 = new Vector4(0.707107f); /// - /// Original: - /// - /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L15 - /// + /// Gets reciprocal coefficients for jpeg quantization tables calculation. /// - /// Source - /// Destination - public static void FDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d) + /// + /// + /// Current FDCT implementation expects its results to be multiplied by + /// a reciprocal quantization table. To get 8x8 reciprocal block values in this + /// table must be divided by quantization table values scaled with quality settings. 
+ /// + /// + /// These values were calculates with this formula: + /// + /// value[row * 8 + col] = scalefactor[row] * scalefactor[col] * 8; + /// + /// Where: + /// + /// scalefactor[0] = 1 + /// + /// + /// scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7 + /// + /// Values are also scaled by 8 so DCT code won't do extra division/multiplication. + /// + /// + internal static readonly float[] DctReciprocalAdjustmentCoefficients = new float[] { - Vector4 c0 = s.V0L; - Vector4 c1 = s.V7L; - Vector4 t0 = c0 + c1; - Vector4 t7 = c0 - c1; - - c1 = s.V6L; - c0 = s.V1L; - Vector4 t1 = c0 + c1; - Vector4 t6 = c0 - c1; - - c1 = s.V5L; - c0 = s.V2L; - Vector4 t2 = c0 + c1; - Vector4 t5 = c0 - c1; - - c0 = s.V3L; - c1 = s.V4L; - Vector4 t3 = c0 + c1; - Vector4 t4 = c0 - c1; - - c0 = t0 + t3; - Vector4 c3 = t0 - t3; - c1 = t1 + t2; - Vector4 c2 = t1 - t2; - - d.V0L = c0 + c1; - d.V4L = c0 - c1; - - float w0 = 0.541196f; - float w1 = 1.306563f; - - d.V2L = (w0 * c2) + (w1 * c3); - d.V6L = (w0 * c3) - (w1 * c2); - - w0 = 1.175876f; - w1 = 0.785695f; - c3 = (w0 * t4) + (w1 * t7); - c0 = (w0 * t7) - (w1 * t4); - - w0 = 1.387040f; - w1 = 0.275899f; - c2 = (w0 * t5) + (w1 * t6); - c1 = (w0 * t6) - (w1 * t5); - - d.V3L = c0 - c2; - d.V5L = c3 - c1; - - float invsqrt2 = 0.707107f; - c0 = (c0 + c2) * invsqrt2; - c3 = (c3 + c1) * invsqrt2; - - d.V1L = c0 + c3; - d.V7L = c0 - c3; - } + 0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f, + 0.09011998f, 0.064972885f, 0.068974845f, 0.07664074f, 0.09011998f, 0.11470097f, 0.16652f, 0.32664075f, + 0.09567086f, 0.068974845f, 0.07322331f, 0.081361376f, 0.09567086f, 0.121765904f, 0.17677669f, 0.34675997f, + 0.10630376f, 0.07664074f, 0.081361376f, 0.09040392f, 0.10630376f, 0.13529903f, 0.19642374f, 0.38529903f, + 0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f, + 0.15909483f, 0.11470097f, 0.121765904f, 0.13529903f, 0.15909483f, 0.2024893f, 0.2939689f, 
0.5766407f, + 0.23096988f, 0.16652f, 0.17677669f, 0.19642374f, 0.23096988f, 0.2939689f, 0.4267767f, 0.8371526f, + 0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f, + }; /// - /// Original: - /// - /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L15 - /// + /// Adjusts given quantization table to be compliant with FDCT implementation. /// - /// Source - /// Destination - public static void FDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d) + /// + /// See docs for explanation. + /// + /// Quantization table to adjust. + public static void AdjustToFDCT(ref Block8x8F quantizationtable) { - Vector4 c0 = s.V0R; - Vector4 c1 = s.V7R; - Vector4 t0 = c0 + c1; - Vector4 t7 = c0 - c1; - - c1 = s.V6R; - c0 = s.V1R; - Vector4 t1 = c0 + c1; - Vector4 t6 = c0 - c1; - - c1 = s.V5R; - c0 = s.V2R; - Vector4 t2 = c0 + c1; - Vector4 t5 = c0 - c1; - - c0 = s.V3R; - c1 = s.V4R; - Vector4 t3 = c0 + c1; - Vector4 t4 = c0 - c1; - - c0 = t0 + t3; - Vector4 c3 = t0 - t3; - c1 = t1 + t2; - Vector4 c2 = t1 - t2; - - d.V0R = c0 + c1; - d.V4R = c0 - c1; - - float w0 = 0.541196f; - float w1 = 1.306563f; - - d.V2R = (w0 * c2) + (w1 * c3); - d.V6R = (w0 * c3) - (w1 * c2); - - w0 = 1.175876f; - w1 = 0.785695f; - c3 = (w0 * t4) + (w1 * t7); - c0 = (w0 * t7) - (w1 * t4); - - w0 = 1.387040f; - w1 = 0.275899f; - c2 = (w0 * t5) + (w1 * t6); - c1 = (w0 * t6) - (w1 * t5); - - d.V3R = c0 - c2; - d.V5R = c3 - c1; - - c0 = (c0 + c2) * InvSqrt2; - c3 = (c3 + c1) * InvSqrt2; - - d.V1R = c0 + c3; - d.V7R = c0 - c3; + for (int i = 0; i < Block8x8F.Size; i++) + { + quantizationtable[i] = DctReciprocalAdjustmentCoefficients[i] / quantizationtable[i]; + } } /// - /// Combined operation of and - /// using AVX commands. + /// Apply 2D floating point FDCT inplace. /// - /// Source - /// Destination - public static void FDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d) + /// Input matrix.
+ public static void TransformFDCT(ref Block8x8F block) { #if SUPPORTS_RUNTIME_INTRINSICS - Debug.Assert(Avx.IsSupported, "AVX is required to execute this method"); - - Vector256 t0 = Avx.Add(s.V0, s.V7); - Vector256 t7 = Avx.Subtract(s.V0, s.V7); - Vector256 t1 = Avx.Add(s.V1, s.V6); - Vector256 t6 = Avx.Subtract(s.V1, s.V6); - Vector256 t2 = Avx.Add(s.V2, s.V5); - Vector256 t5 = Avx.Subtract(s.V2, s.V5); - Vector256 t3 = Avx.Add(s.V3, s.V4); - Vector256 t4 = Avx.Subtract(s.V3, s.V4); - - Vector256 c0 = Avx.Add(t0, t3); - Vector256 c1 = Avx.Add(t1, t2); - - // 0 4 - d.V0 = Avx.Add(c0, c1); - d.V4 = Avx.Subtract(c0, c1); - - Vector256 c3 = Avx.Subtract(t0, t3); - Vector256 c2 = Avx.Subtract(t1, t2); - - // 2 6 - d.V2 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(c2, C_V_0_5411), c3, C_V_1_3065); - d.V6 = SimdUtils.HwIntrinsics.MultiplySubstract(Avx.Multiply(c2, C_V_1_3065), c3, C_V_0_5411); - - c3 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(t4, C_V_1_1758), t7, C_V_0_7856); - c0 = SimdUtils.HwIntrinsics.MultiplySubstract(Avx.Multiply(t4, C_V_0_7856), t7, C_V_1_1758); - - c2 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(t5, C_V_1_3870), C_V_0_2758, t6); - c1 = SimdUtils.HwIntrinsics.MultiplySubstract(Avx.Multiply(C_V_0_2758, t5), t6, C_V_1_3870); - - // 3 5 - d.V3 = Avx.Subtract(c0, c2); - d.V5 = Avx.Subtract(c3, c1); - - c0 = Avx.Multiply(Avx.Add(c0, c2), C_V_InvSqrt2); - c3 = Avx.Multiply(Avx.Add(c3, c1), C_V_InvSqrt2); - - // 1 7 - d.V1 = Avx.Add(c0, c3); - d.V7 = Avx.Subtract(c0, c3); + if (Avx.IsSupported) + { + ForwardTransform_Avx(ref block); + } + else #endif + if (Vector.IsHardwareAccelerated) + { + ForwardTransform_Vector4(ref block); + } + else + { + ForwardTransform_Scalar(ref block); + } } /// - /// Performs 8x8 matrix Forward Discrete Cosine Transform + /// Apply 2D floating point FDCT inplace using scalar operations. 
/// - /// Source - /// Destination - public static void FDCT8x8(ref Block8x8F s, ref Block8x8F d) + /// + /// Ported from libjpeg-turbo https://github.com/libjpeg-turbo/libjpeg-turbo/blob/main/jfdctflt.c. + /// + /// Input matrix. + private static void ForwardTransform_Scalar(ref Block8x8F block) { -#if SUPPORTS_RUNTIME_INTRINSICS - if (Avx.IsSupported) + const int dctSize = 8; + + float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + float tmp10, tmp11, tmp12, tmp13; + float z1, z2, z3, z4, z5, z11, z13; + + // First pass - process rows + ref float dataRef = ref Unsafe.As(ref block); + for (int ctr = 7; ctr >= 0; ctr--) { - FDCT8x8_Avx(ref s, ref d); + tmp0 = Unsafe.Add(ref dataRef, 0) + Unsafe.Add(ref dataRef, 7); + tmp7 = Unsafe.Add(ref dataRef, 0) - Unsafe.Add(ref dataRef, 7); + tmp1 = Unsafe.Add(ref dataRef, 1) + Unsafe.Add(ref dataRef, 6); + tmp6 = Unsafe.Add(ref dataRef, 1) - Unsafe.Add(ref dataRef, 6); + tmp2 = Unsafe.Add(ref dataRef, 2) + Unsafe.Add(ref dataRef, 5); + tmp5 = Unsafe.Add(ref dataRef, 2) - Unsafe.Add(ref dataRef, 5); + tmp3 = Unsafe.Add(ref dataRef, 3) + Unsafe.Add(ref dataRef, 4); + tmp4 = Unsafe.Add(ref dataRef, 3) - Unsafe.Add(ref dataRef, 4); + + // Even part + tmp10 = tmp0 + tmp3; + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + + Unsafe.Add(ref dataRef, 0) = tmp10 + tmp11; + Unsafe.Add(ref dataRef, 4) = tmp10 - tmp11; + + z1 = (tmp12 + tmp13) * 0.707106781f; + Unsafe.Add(ref dataRef, 2) = tmp13 + z1; + Unsafe.Add(ref dataRef, 6) = tmp13 - z1; + + // Odd part + tmp10 = tmp4 + tmp5; + tmp11 = tmp5 + tmp6; + tmp12 = tmp6 + tmp7; + + z5 = (tmp10 - tmp12) * 0.382683433f; + z2 = (0.541196100f * tmp10) + z5; + z4 = (1.306562965f * tmp12) + z5; + z3 = tmp11 * 0.707106781f; + + z11 = tmp7 + z3; + z13 = tmp7 - z3; + + Unsafe.Add(ref dataRef, 5) = z13 + z2; + Unsafe.Add(ref dataRef, 3) = z13 - z2; + Unsafe.Add(ref dataRef, 1) = z11 + z4; + Unsafe.Add(ref dataRef, 7) = z11 - z4; + + dataRef = ref Unsafe.Add(ref dataRef, 
dctSize); } - else -#endif + + // Second pass - process columns + dataRef = ref Unsafe.As(ref block); + for (int ctr = 7; ctr >= 0; ctr--) { - FDCT8x4_LeftPart(ref s, ref d); - FDCT8x4_RightPart(ref s, ref d); + tmp0 = Unsafe.Add(ref dataRef, dctSize * 0) + Unsafe.Add(ref dataRef, dctSize * 7); + tmp7 = Unsafe.Add(ref dataRef, dctSize * 0) - Unsafe.Add(ref dataRef, dctSize * 7); + tmp1 = Unsafe.Add(ref dataRef, dctSize * 1) + Unsafe.Add(ref dataRef, dctSize * 6); + tmp6 = Unsafe.Add(ref dataRef, dctSize * 1) - Unsafe.Add(ref dataRef, dctSize * 6); + tmp2 = Unsafe.Add(ref dataRef, dctSize * 2) + Unsafe.Add(ref dataRef, dctSize * 5); + tmp5 = Unsafe.Add(ref dataRef, dctSize * 2) - Unsafe.Add(ref dataRef, dctSize * 5); + tmp3 = Unsafe.Add(ref dataRef, dctSize * 3) + Unsafe.Add(ref dataRef, dctSize * 4); + tmp4 = Unsafe.Add(ref dataRef, dctSize * 3) - Unsafe.Add(ref dataRef, dctSize * 4); + + // Even part + tmp10 = tmp0 + tmp3; + tmp13 = tmp0 - tmp3; + tmp11 = tmp1 + tmp2; + tmp12 = tmp1 - tmp2; + + Unsafe.Add(ref dataRef, dctSize * 0) = tmp10 + tmp11; + Unsafe.Add(ref dataRef, dctSize * 4) = tmp10 - tmp11; + + z1 = (tmp12 + tmp13) * 0.707106781f; + Unsafe.Add(ref dataRef, dctSize * 2) = tmp13 + z1; + Unsafe.Add(ref dataRef, dctSize * 6) = tmp13 - z1; + + // Odd part + tmp10 = tmp4 + tmp5; + tmp11 = tmp5 + tmp6; + tmp12 = tmp6 + tmp7; + + z5 = (tmp10 - tmp12) * 0.382683433f; + z2 = (0.541196100f * tmp10) + z5; + z4 = (1.306562965f * tmp12) + z5; + z3 = tmp11 * 0.707106781f; + + z11 = tmp7 + z3; + z13 = tmp7 - z3; + + Unsafe.Add(ref dataRef, dctSize * 5) = z13 + z2; + Unsafe.Add(ref dataRef, dctSize * 3) = z13 - z2; + Unsafe.Add(ref dataRef, dctSize * 1) = z11 + z4; + Unsafe.Add(ref dataRef, dctSize * 7) = z11 - z4; + + dataRef = ref Unsafe.Add(ref dataRef, 1); } } /// - /// Apply floating point FDCT from src into dest + /// Apply floating point FDCT inplace using API. 
/// - /// Source - /// Destination - /// Temporary block provided by the caller for optimization - /// If true, a constant -128.0 offset is applied for all values before FDCT - public static void TransformFDCT( - ref Block8x8F src, - ref Block8x8F dest, - ref Block8x8F temp, - bool offsetSourceByNeg128 = true) + /// + /// This implementation must be called only if hardware supports 4 + /// floating point numbers vector. Otherwise explicit scalar + /// implementation is faster + /// because it does not rely on matrix transposition. + /// + /// Input matrix. + private static void ForwardTransform_Vector4(ref Block8x8F block) { - src.TransposeInto(ref temp); - if (offsetSourceByNeg128) - { - temp.AddInPlace(-128F); - } + DebugGuard.IsTrue(Vector.IsHardwareAccelerated, "Scalar implementation should be called for non-accelerated hardware."); - FDCT8x8(ref temp, ref dest); + // First pass - process rows + block.TransposeInplace(); + FDCT8x4_Vector4(ref block.V0L); + FDCT8x4_Vector4(ref block.V0R); - dest.TransposeInto(ref temp); + // Second pass - process columns + block.TransposeInplace(); + FDCT8x4_Vector4(ref block.V0L); + FDCT8x4_Vector4(ref block.V0R); + } - FDCT8x8(ref temp, ref dest); + /// + /// Apply 1D floating point FDCT inplace on 8x4 part of 8x8 matrix. + /// + /// + /// Implemented using Vector4 API operations for either scalar or sse hardware implementation. + /// Must be called on both 8x4 matrix parts for the full FDCT transform. 
+ /// + /// Input reference to the first + private static void FDCT8x4_Vector4(ref Vector4 blockRef) + { + Vector4 tmp0 = Unsafe.Add(ref blockRef, 0) + Unsafe.Add(ref blockRef, 14); + Vector4 tmp7 = Unsafe.Add(ref blockRef, 0) - Unsafe.Add(ref blockRef, 14); + Vector4 tmp1 = Unsafe.Add(ref blockRef, 2) + Unsafe.Add(ref blockRef, 12); + Vector4 tmp6 = Unsafe.Add(ref blockRef, 2) - Unsafe.Add(ref blockRef, 12); + Vector4 tmp2 = Unsafe.Add(ref blockRef, 4) + Unsafe.Add(ref blockRef, 10); + Vector4 tmp5 = Unsafe.Add(ref blockRef, 4) - Unsafe.Add(ref blockRef, 10); + Vector4 tmp3 = Unsafe.Add(ref blockRef, 6) + Unsafe.Add(ref blockRef, 8); + Vector4 tmp4 = Unsafe.Add(ref blockRef, 6) - Unsafe.Add(ref blockRef, 8); + + // Even part + Vector4 tmp10 = tmp0 + tmp3; + Vector4 tmp13 = tmp0 - tmp3; + Vector4 tmp11 = tmp1 + tmp2; + Vector4 tmp12 = tmp1 - tmp2; + + Unsafe.Add(ref blockRef, 0) = tmp10 + tmp11; + Unsafe.Add(ref blockRef, 8) = tmp10 - tmp11; + + Vector4 z1 = (tmp12 + tmp13) * mm128_F_0_7071; + Unsafe.Add(ref blockRef, 4) = tmp13 + z1; + Unsafe.Add(ref blockRef, 12) = tmp13 - z1; + + // Odd part + tmp10 = tmp4 + tmp5; + tmp11 = tmp5 + tmp6; + tmp12 = tmp6 + tmp7; + + Vector4 z5 = (tmp10 - tmp12) * mm128_F_0_3826; + Vector4 z2 = (mm128_F_0_5411 * tmp10) + z5; + Vector4 z4 = (mm128_F_1_3065 * tmp12) + z5; + Vector4 z3 = tmp11 * mm128_F_0_7071; + + Vector4 z11 = tmp7 + z3; + Vector4 z13 = tmp7 - z3; + + Unsafe.Add(ref blockRef, 10) = z13 + z2; + Unsafe.Add(ref blockRef, 6) = z13 - z2; + Unsafe.Add(ref blockRef, 2) = z11 + z4; + Unsafe.Add(ref blockRef, 14) = z11 - z4; + } - dest.MultiplyInPlace(C_0_125); + /// + /// Apply floating point IDCT inplace. + /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239. + /// + /// Input matrix. + /// Matrix to store temporary results.
+ public static void TransformIDCT(ref Block8x8F block, ref Block8x8F temp) + { + block.TransposeInplace(); + IDCT8x8(ref block, ref temp); + temp.TransposeInplace(); + IDCT8x8(ref temp, ref block); + + // TODO: This can be fused into quantization table step + block.MultiplyInPlace(C_0_125); } /// @@ -303,7 +322,7 @@ public static void TransformFDCT( /// /// Source /// Destination - public static void IDCT8x8(ref Block8x8F s, ref Block8x8F d) + private static void IDCT8x8(ref Block8x8F s, ref Block8x8F d) { #if SUPPORTS_RUNTIME_INTRINSICS if (Avx.IsSupported) @@ -432,83 +451,5 @@ public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d) d.V3R = my3 + mb3; d.V4R = my3 - mb3; } - - /// - /// Combined operation of and - /// using AVX commands. - /// - /// Source - /// Destination - public static void IDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d) - { -#if SUPPORTS_RUNTIME_INTRINSICS - Debug.Assert(Avx.IsSupported, "AVX is required to execute this method"); - - Vector256 my1 = s.V1; - Vector256 my7 = s.V7; - Vector256 mz0 = Avx.Add(my1, my7); - - Vector256 my3 = s.V3; - Vector256 mz2 = Avx.Add(my3, my7); - Vector256 my5 = s.V5; - Vector256 mz1 = Avx.Add(my3, my5); - Vector256 mz3 = Avx.Add(my1, my5); - - Vector256 mz4 = Avx.Multiply(Avx.Add(mz0, mz1), C_V_1_1758); - - mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz2, C_V_n1_9615); - mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz3, C_V_n0_3901); - mz0 = Avx.Multiply(mz0, C_V_n0_8999); - mz1 = Avx.Multiply(mz1, C_V_n2_5629); - - Vector256 mb3 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my7, C_V_0_2986), mz2); - Vector256 mb2 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my5, C_V_2_0531), mz3); - Vector256 mb1 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my3, C_V_3_0727), mz2); - Vector256 mb0 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my1, C_V_1_5013), mz3); - - Vector256 my2 = s.V2; - Vector256 my6 = s.V6; - mz4 = Avx.Multiply(Avx.Add(my2, my6), C_V_0_5411); - Vector256 my0 = s.V0; - 
Vector256 my4 = s.V4; - mz0 = Avx.Add(my0, my4); - mz1 = Avx.Subtract(my0, my4); - mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my6, C_V_n1_8477); - mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my2, C_V_0_7653); - - my0 = Avx.Add(mz0, mz3); - my3 = Avx.Subtract(mz0, mz3); - my1 = Avx.Add(mz1, mz2); - my2 = Avx.Subtract(mz1, mz2); - - d.V0 = Avx.Add(my0, mb0); - d.V7 = Avx.Subtract(my0, mb0); - d.V1 = Avx.Add(my1, mb1); - d.V6 = Avx.Subtract(my1, mb1); - d.V2 = Avx.Add(my2, mb2); - d.V5 = Avx.Subtract(my2, mb2); - d.V3 = Avx.Add(my3, mb3); - d.V4 = Avx.Subtract(my3, mb3); -#endif - } - - /// - /// Apply floating point IDCT transformation into dest, using a temporary block 'temp' provided by the caller (optimization). - /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239 - /// - /// Source - /// Destination - /// Temporary block provided by the caller - public static void TransformIDCT(ref Block8x8F src, ref Block8x8F dest, ref Block8x8F temp) - { - src.TransposeInto(ref temp); - - IDCT8x8(ref temp, ref dest); - dest.TransposeInto(ref temp); - IDCT8x8(ref temp, ref dest); - - // TODO: What if we leave the blocks in a scaled-by-x8 state until final color packing? - dest.MultiplyInPlace(C_0_125); - } } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Quantization.cs b/src/ImageSharp/Formats/Jpeg/Components/Quantization.cs index 2ff56c63b9..eab5e6a082 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Quantization.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Quantization.cs @@ -39,53 +39,59 @@ internal static class Quantization public const int QualityEstimationConfidenceUpperThreshold = 98; /// - /// Gets the unscaled luminance quantization table in zig-zag order. Each - /// encoder copies and scales the tables according to its quality parameter. - /// The values are derived from ITU section K.1 after converting from natural to - /// zig-zag order. + /// Gets unscaled luminance quantization table. 
/// + /// + /// The values are derived from ITU section K.1. + /// // The C# compiler emits this as a compile-time constant embedded in the PE file. // This is effectively compiled down to: return new ReadOnlySpan(&data, length) // More details can be found: https://github.com/dotnet/roslyn/pull/24621 - public static ReadOnlySpan UnscaledQuant_Luminance => new byte[] + public static ReadOnlySpan LuminanceTable => new byte[] { - 16, 11, 12, 14, 12, 10, 16, 14, 13, 14, 18, 17, 16, 19, 24, - 40, 26, 24, 22, 22, 24, 49, 35, 37, 29, 40, 58, 51, 61, 60, - 57, 51, 56, 55, 64, 72, 92, 78, 64, 68, 87, 69, 55, 56, 80, - 109, 81, 87, 95, 98, 103, 104, 103, 62, 77, 113, 121, 112, - 100, 120, 92, 101, 103, 99, + 16, 11, 10, 16, 24, 40, 51, 61, + 12, 12, 14, 19, 26, 58, 60, 55, + 14, 13, 16, 24, 40, 57, 69, 56, + 14, 17, 22, 29, 51, 87, 80, 62, + 18, 22, 37, 56, 68, 109, 103, 77, + 24, 35, 55, 64, 81, 104, 113, 92, + 49, 64, 78, 87, 103, 121, 120, 101, + 72, 92, 95, 98, 112, 100, 103, 99, }; /// - /// Gets the unscaled chrominance quantization table in zig-zag order. Each - /// encoder copies and scales the tables according to its quality parameter. - /// The values are derived from ITU section K.1 after converting from natural to - /// zig-zag order. + /// Gets unscaled chrominance quantization table. /// + /// + /// The values are derived from ITU section K.1. + /// // The C# compiler emits this as a compile-time constant embedded in the PE file. 
// This is effectively compiled down to: return new ReadOnlySpan(&data, length) // More details can be found: https://github.com/dotnet/roslyn/pull/24621 - public static ReadOnlySpan UnscaledQuant_Chrominance => new byte[] + public static ReadOnlySpan ChrominanceTable => new byte[] { - 17, 18, 18, 24, 21, 24, 47, 26, 26, 47, 99, 66, 56, 66, - 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, - 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, - 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, + 17, 18, 24, 47, 99, 99, 99, 99, + 18, 21, 26, 66, 99, 99, 99, 99, + 24, 26, 56, 99, 99, 99, 99, 99, + 47, 66, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, }; /// Ported from JPEGsnoop: /// https://github.com/ImpulseAdventure/JPEGsnoop/blob/9732ee0961f100eb69bbff4a0c47438d5997abee/source/JfifDecode.cpp#L4570-L4694 /// - /// Estimates jpeg quality based on quantization table in zig-zag order. + /// Estimates jpeg quality based on standard quantization table. /// /// - /// This technically can be used with any given table but internal decoder code uses ITU spec tables: - /// and . + /// Technically, this can be used with any given table but internal decoder code uses ITU spec tables: + /// and . /// /// Input quantization table. - /// Quantization to estimate against. - /// Estimated quality + /// Natural order quantization table to estimate against. + /// Estimated quality. public static int EstimateQuality(ref Block8x8F table, ReadOnlySpan target) { // This method can be SIMD'ified if standard table is injected as Block8x8F. 
@@ -106,11 +112,10 @@ public static int EstimateQuality(ref Block8x8F table, ReadOnlySpan target int quality; for (int i = 0; i < Block8x8F.Size; i++) { - float coeff = table[i]; - int coeffInteger = (int)coeff; + int coeff = (int)table[i]; // Coefficients are actually int16 casted to float numbers so there's no truncating error. - if (coeffInteger != 0) + if (coeff != 0) { comparePercent = 100.0 * (table[i] / target[i]); } @@ -152,7 +157,7 @@ public static int EstimateQuality(ref Block8x8F table, ReadOnlySpan target /// Estimated quality [MethodImpl(MethodImplOptions.AggressiveInlining)] public static int EstimateLuminanceQuality(ref Block8x8F luminanceTable) - => EstimateQuality(ref luminanceTable, UnscaledQuant_Luminance); + => EstimateQuality(ref luminanceTable, LuminanceTable); /// /// Estimates jpeg quality based on quantization table in zig-zag order. @@ -161,7 +166,7 @@ public static int EstimateLuminanceQuality(ref Block8x8F luminanceTable) /// Estimated quality [MethodImpl(MethodImplOptions.AggressiveInlining)] public static int EstimateChrominanceQuality(ref Block8x8F chrominanceTable) - => EstimateQuality(ref chrominanceTable, UnscaledQuant_Chrominance); + => EstimateQuality(ref chrominanceTable, ChrominanceTable); [MethodImpl(MethodImplOptions.AggressiveInlining)] private static int QualityToScale(int quality) @@ -185,10 +190,10 @@ private static Block8x8F ScaleQuantizationTable(int scale, ReadOnlySpan un [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Block8x8F ScaleLuminanceTable(int quality) - => ScaleQuantizationTable(scale: QualityToScale(quality), UnscaledQuant_Luminance); + => ScaleQuantizationTable(scale: QualityToScale(quality), LuminanceTable); [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Block8x8F ScaleChrominanceTable(int quality) - => ScaleQuantizationTable(scale: QualityToScale(quality), UnscaledQuant_Chrominance); + => ScaleQuantizationTable(scale: QualityToScale(quality), ChrominanceTable); } } diff 
--git a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs new file mode 100644 index 0000000000..6577739c1a --- /dev/null +++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs @@ -0,0 +1,300 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +#if SUPPORTS_RUNTIME_INTRINSICS +using System; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; + +namespace SixLabors.ImageSharp.Formats.Jpeg.Components +{ + internal static partial class ZigZag + { +#pragma warning disable SA1309 // naming rules violation warnings + /// + /// Special byte value to zero out elements during Sse/Avx shuffle intrinsics. + /// + private const byte _ = 0xff; +#pragma warning restore SA1309 + + /// + /// Gets shuffle vectors for + /// zig zag implementation. + /// + private static ReadOnlySpan SseShuffleMasks => new byte[] + { + // row0 + 0, 1, 2, 3, _, _, _, _, _, _, 4, 5, 6, 7, _, _, + _, _, _, _, 0, 1, _, _, 2, 3, _, _, _, _, 4, 5, + _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, + + // row1 + _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, 10, 11, + 2, 3, _, _, _, _, _, _, 4, 5, _, _, _, _, _, _, + _, _, 0, 1, _, _, 2, 3, _, _, _, _, _, _, _, _, + + // row2 + _, _, _, _, _, _, 2, 3, _, _, _, _, _, _, 4, 5, + _, _, _, _, _, _, _, _, 0, 1, _, _, 2, 3, _, _, + + // row3 + _, _, _, _, _, _, 12, 13, 14, 15, _, _, _, _, _, _, + _, _, _, _, 10, 11, _, _, _, _, 12, 13, _, _, _, _, + _, _, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _, + 6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, + + // row4 + _, _, 4, 5, _, _, _, _, _, _, _, _, 6, 7, _, _, + _, _, _, _, 2, 3, _, _, _, _, 4, 5, _, _, _, _, + _, _, _, _, _, _, 0, 1, 2, 3, _, _, _, _, _, _, + + // row5 + _, _, 12, 13, _, _, 14, 15, _, _, _, _, _, _, _, _, + 10, 11, _, _, _, _, _, _, 12, 13, _, _, _, _, _, _, + + // row6 + _, _, _, _, _, _, _, _, 12, 13, _, _, 14, 15, _, _, + _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, 
12, 13, + 4, 5, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _, + + // row7 + 10, 11, _, _, _, _, 12, 13, _, _, 14, 15, _, _, _, _, + _, _, 8, 9, 10, 11, _, _, _, _, _, _, 12, 13, 14, 15 + }; + + /// + /// Gets shuffle vectors for + /// zig zag implementation. + /// + private static ReadOnlySpan AvxShuffleMasks => new byte[] + { + // 01_AB/01_EF/23_CD - cross-lane + 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, 6, 0, 0, 0, + + // 01_AB - inner-lane + 0, 1, 2, 3, 8, 9, _, _, 10, 11, 4, 5, 6, 7, 12, 13, _, _, _, _, _, _, _, _, _, _, 10, 11, 4, 5, 6, 7, + + // 01_CD/23_GH - cross-lane + 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, _, _, _, _, 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, _, _, _, _, + + // 01_CD - inner-lane + _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, 2, 3, 8, 9, _, _, 10, 11, 4, 5, _, _, _, _, _, _, + + // 01_EF - inner-lane + _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, _, _, + + // 23_AB/45_CD/67_EF - cross-lane + 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, _, _, _, _, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, _, _, _, _, + + // 23_AB - inner-lane + 4, 5, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7, 0, 1, 2, 3, 8, 9, _, _, _, _, + + // 23_CD - inner-lane + _, _, 6, 7, 12, 13, _, _, _, _, _, _, _, _, _, _, 10, 11, 4, 5, _, _, _, _, _, _, _, _, 6, 7, 12, 13, + + // 23_EF - inner-lane + _, _, _, _, _, _, 2, 3, 8, 9, _, _, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, + + // 23_GH - inner-lane + _, _, _, _, _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, + + // 45_AB - inner-lane + _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, _, _, _, _, + + // 45_CD - inner-lane + _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7, 0, 1, _, _, 2, 3, 8, 9, _, _, _, _, _, _, + + // 45_EF - cross-lane + 1, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, _, _, _, _, 2, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, + + // 45_EF - 
inner-lane + 2, 3, 8, 9, _, _, _, _, _, _, _, _, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, 2, 3, 8, 9, _, _, + + // 45_GH - inner-lane + _, _, _, _, 2, 3, 8, 9, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7, + + // 67_CD - inner-lane + _, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, + + // 67_EF - inner-lane + _, _, _, _, _, _, 6, 7, 0, 1, _, _, 2, 3, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, + + // 67_GH - inner-lane + 8, 9, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, 2, 3, 8, 9, 10, 11, 4, 5, _, _, 6, 7, 12, 13, 14, 15 + }; + + /// + /// Applies zig zag ordering for given 8x8 matrix using SSE cpu intrinsics. + /// + /// Input matrix. + public static unsafe void ApplyZigZagOrderingSsse3(ref Block8x8 block) + { + DebugGuard.IsTrue(Ssse3.IsSupported, "Ssse3 support is required to run this operation!"); + + fixed (byte* maskPtr = SseShuffleMasks) + { + Vector128 rowA = block.V0.AsByte(); + Vector128 rowB = block.V1.AsByte(); + Vector128 rowC = block.V2.AsByte(); + Vector128 rowD = block.V3.AsByte(); + Vector128 rowE = block.V4.AsByte(); + Vector128 rowF = block.V5.AsByte(); + Vector128 rowG = block.V6.AsByte(); + Vector128 rowH = block.V7.AsByte(); + + // row0 - A0 A1 B0 C0 B1 A2 A3 B2 + Vector128 rowA0 = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (16 * 0))).AsInt16(); + Vector128 rowB0 = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (16 * 1))).AsInt16(); + Vector128 row0 = Sse2.Or(rowA0, rowB0); + Vector128 rowC0 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 2))).AsInt16(); + row0 = Sse2.Or(row0, rowC0); + + // row1 - C1 D0 E0 D1 C2 B3 A4 A5 + Vector128 rowA1 = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (16 * 3))).AsInt16(); + Vector128 rowC1 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 4))).AsInt16(); + Vector128 row1 = Sse2.Or(rowA1, rowC1); + Vector128 rowD1 = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (16 * 5))).AsInt16(); + 
row1 = Sse2.Or(row1, rowD1); + row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowB.AsUInt16(), 3), 5).AsInt16(); + row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 0), 2).AsInt16(); + + // row2 + Vector128 rowE2 = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (16 * 6))).AsInt16(); + Vector128 rowF2 = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (16 * 7))).AsInt16(); + Vector128 row2 = Sse2.Or(rowE2, rowF2); + row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowB.AsUInt16(), 4), 0).AsInt16(); + row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowC.AsUInt16(), 3), 1).AsInt16(); + row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 2), 2).AsInt16(); + row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowG.AsUInt16(), 0), 5).AsInt16(); + + // row3 + Vector128 rowA3 = Ssse3.Shuffle(rowA, Sse2.LoadVector128(maskPtr + (16 * 8))).AsInt16().AsInt16(); + Vector128 rowB3 = Ssse3.Shuffle(rowB, Sse2.LoadVector128(maskPtr + (16 * 9))).AsInt16().AsInt16(); + Vector128 row3 = Sse2.Or(rowA3, rowB3); + Vector128 rowC3 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 10))).AsInt16(); + row3 = Sse2.Or(row3, rowC3); + Vector128 shuffleRowD3EF = Sse2.LoadVector128(maskPtr + (16 * 11)); + Vector128 rowD3 = Ssse3.Shuffle(rowD, shuffleRowD3EF).AsInt16(); + row3 = Sse2.Or(row3, rowD3); + + // row4 + Vector128 rowE4 = Ssse3.Shuffle(rowE, shuffleRowD3EF).AsInt16(); + Vector128 rowF4 = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (16 * 12))).AsInt16(); + Vector128 row4 = Sse2.Or(rowE4, rowF4); + Vector128 rowG4 = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (16 * 13))).AsInt16(); + row4 = Sse2.Or(row4, rowG4); + Vector128 rowH4 = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (16 * 14))).AsInt16(); + row4 = Sse2.Or(row4, rowH4); + + // row5 + Vector128 rowC5 = Ssse3.Shuffle(rowC, Sse2.LoadVector128(maskPtr + (16 * 15))).AsInt16(); + Vector128 rowD5 = Ssse3.Shuffle(rowD, Sse2.LoadVector128(maskPtr + (16 * 16))).AsInt16(); + Vector128 row5 
= Sse2.Or(rowC5, rowD5); + row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowB.AsUInt16(), 7), 2).AsInt16(); + row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 5), 5).AsInt16(); + row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 4), 6).AsInt16(); + row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowG.AsUInt16(), 3), 7).AsInt16(); + + // row6 + Vector128 rowE6 = Ssse3.Shuffle(rowE, Sse2.LoadVector128(maskPtr + (16 * 17))).AsInt16(); + Vector128 rowF6 = Ssse3.Shuffle(rowF, Sse2.LoadVector128(maskPtr + (16 * 18))).AsInt16(); + Vector128 row6 = Sse2.Or(rowE6, rowF6); + Vector128 rowH6 = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (16 * 19))).AsInt16(); + row6 = Sse2.Or(row6, rowH6); + row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 7), 5).AsInt16(); + row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowG.AsUInt16(), 4), 2).AsInt16(); + + // row7 + Vector128 rowG7 = Ssse3.Shuffle(rowG, Sse2.LoadVector128(maskPtr + (16 * 20))).AsInt16(); + Vector128 rowH7 = Ssse3.Shuffle(rowH, Sse2.LoadVector128(maskPtr + (16 * 21))).AsInt16(); + Vector128 row7 = Sse2.Or(rowG7, rowH7); + row7 = Sse2.Insert(row7.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 7), 4).AsInt16(); + + block.V0 = row0; + block.V1 = row1; + block.V2 = row2; + block.V3 = row3; + block.V4 = row4; + block.V5 = row5; + block.V6 = row6; + block.V7 = row7; + } + } + + /// + /// Applies zig zag ordering for given 8x8 matrix using AVX cpu intrinsics. + /// + /// Input matrix. 
+ public static unsafe void ApplyZigZagOrderingAvx2(ref Block8x8 block) + { + DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!"); + + fixed (byte* shuffleVectorsPtr = AvxShuffleMasks) + { + Vector256 rowsAB = block.V01.AsByte(); + Vector256 rowsCD = block.V23.AsByte(); + Vector256 rowsEF = block.V45.AsByte(); + Vector256 rowsGH = block.V67.AsByte(); + + // rows 0 1 + Vector256 rows_AB01_EF01_CD23_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (0 * 32)).AsInt32(); + Vector256 row01_AB = Avx2.PermuteVar8x32(rowsAB.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte(); + row01_AB = Avx2.Shuffle(row01_AB, Avx.LoadVector256(shuffleVectorsPtr + (1 * 32))).AsByte(); + + Vector256 rows_CD01_GH23_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (2 * 32)).AsInt32(); + Vector256 row01_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_CD01_GH23_shuffleMask).AsByte(); + row01_CD = Avx2.Shuffle(row01_CD, Avx.LoadVector256(shuffleVectorsPtr + (3 * 32))).AsByte(); + + Vector256 row0123_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte(); + Vector256 row01_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (4 * 32))).AsByte(); + + Vector256 row01 = Avx2.Or(Avx2.Or(row01_AB, row01_CD), row01_EF); + + // rows 2 3 + Vector256 rows_AB23_CD45_EF67_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (5 * 32)).AsInt32(); + Vector256 row2345_AB = Avx2.PermuteVar8x32(rowsAB.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte(); + Vector256 row23_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (6 * 32))).AsByte(); + + Vector256 row23_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_AB01_EF01_CD23_shuffleMask).AsByte(); + row23_CD = Avx2.Shuffle(row23_CD, Avx.LoadVector256(shuffleVectorsPtr + (7 * 32))).AsByte(); + + Vector256 row23_EF = Avx2.Shuffle(row0123_EF, Avx.LoadVector256(shuffleVectorsPtr + (8 * 32))).AsByte(); + + Vector256 row2345_GH = Avx2.PermuteVar8x32(rowsGH.AsInt32(), 
rows_CD01_GH23_shuffleMask).AsByte(); + Vector256 row23_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (9 * 32)).AsByte()); + + Vector256 row23 = Avx2.Or(Avx2.Or(row23_AB, row23_CD), Avx2.Or(row23_EF, row23_GH)); + + // rows 4 5 + Vector256 row45_AB = Avx2.Shuffle(row2345_AB, Avx.LoadVector256(shuffleVectorsPtr + (10 * 32)).AsByte()); + Vector256 row4567_CD = Avx2.PermuteVar8x32(rowsCD.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte(); + Vector256 row45_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (11 * 32)).AsByte()); + + Vector256 rows_EF45_GH67_shuffleMask = Avx.LoadVector256(shuffleVectorsPtr + (12 * 32)).AsInt32(); + Vector256 row45_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_EF45_GH67_shuffleMask).AsByte(); + row45_EF = Avx2.Shuffle(row45_EF, Avx.LoadVector256(shuffleVectorsPtr + (13 * 32)).AsByte()); + + Vector256 row45_GH = Avx2.Shuffle(row2345_GH, Avx.LoadVector256(shuffleVectorsPtr + (14 * 32)).AsByte()); + + Vector256 row45 = Avx2.Or(Avx2.Or(row45_AB, row45_CD), Avx2.Or(row45_EF, row45_GH)); + + // rows 6 7 + Vector256 row67_CD = Avx2.Shuffle(row4567_CD, Avx.LoadVector256(shuffleVectorsPtr + (15 * 32)).AsByte()); + + Vector256 row67_EF = Avx2.PermuteVar8x32(rowsEF.AsInt32(), rows_AB23_CD45_EF67_shuffleMask).AsByte(); + row67_EF = Avx2.Shuffle(row67_EF, Avx.LoadVector256(shuffleVectorsPtr + (16 * 32)).AsByte()); + + Vector256 row67_GH = Avx2.PermuteVar8x32(rowsGH.AsInt32(), rows_EF45_GH67_shuffleMask).AsByte(); + row67_GH = Avx2.Shuffle(row67_GH, Avx.LoadVector256(shuffleVectorsPtr + (17 * 32)).AsByte()); + + Vector256 row67 = Avx2.Or(Avx2.Or(row67_CD, row67_EF), row67_GH); + + block.V01 = row01.AsInt16(); + block.V23 = row23.AsInt16(); + block.V45 = row45.AsInt16(); + block.V67 = row67.AsInt16(); + } + } + } +} +#endif diff --git a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs index 737652d4e6..e519a8a1dc 100644 --- 
a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.cs @@ -2,21 +2,15 @@ // Licensed under the Apache License, Version 2.0. using System; -using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; namespace SixLabors.ImageSharp.Formats.Jpeg.Components { - /// - /// Holds the Jpeg UnZig array in a value/stack type. - /// Unzig maps from the zigzag ordering to the natural ordering. For example, - /// unzig[3] is the column and row of the fourth element in zigzag order. The - /// value is 16, which means first column (16%8 == 0) and third row (16/8 == 2). - /// - [StructLayout(LayoutKind.Sequential)] - internal unsafe struct ZigZag + internal static partial class ZigZag { /// + /// Gets span of zig-zag ordering indices. + /// + /// /// When reading corrupted data, the Huffman decoders could attempt /// to reference an entry beyond the end of this array (if the decoded /// zero run length reaches past the end of the block). To prevent @@ -25,20 +19,8 @@ internal unsafe struct ZigZag /// to be stored in location 63 of the block, not somewhere random. /// The worst case would be a run-length of 15, which means we need 16 /// fake entries. - /// - private const int Size = 64 + 16; - - /// - /// Copy of in a value type - /// - public fixed byte Data[Size]; - - /// - /// Gets the unzigs map, which maps from the zigzag ordering to the natural ordering. - /// For example, unzig[3] is the column and row of the fourth element in zigzag order. - /// The value is 16, which means first column (16%8 == 0) and third row (16/8 == 2). 
- /// - private static ReadOnlySpan Unzig => new byte[] + /// + public static ReadOnlySpan ZigZagOrder => new byte[] { 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, @@ -48,53 +30,10 @@ internal unsafe struct ZigZag 29, 22, 15, 23, 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63, - 63, 63, 63, 63, 63, 63, 63, 63, // Extra entries for safety in decoder + + // Extra entries for safety in decoder + 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 }; - - /// - /// Returns the value at the given index - /// - /// The index - /// The - public byte this[int idx] - { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - get - { - ref byte self = ref Unsafe.As(ref this); - return Unsafe.Add(ref self, idx); - } - } - - /// - /// Creates and fills an instance of with Jpeg unzig indices - /// - /// The new instance - public static ZigZag CreateUnzigTable() - { - ZigZag result = default; - ref byte sourceRef = ref MemoryMarshal.GetReference(Unzig); - ref byte destinationRef = ref Unsafe.AsRef(result.Data); - - Unzig.CopyTo(new Span(result.Data, Size)); - - return result; - } - - /// - /// Apply Zigging to the given quantization table, so it will be sufficient to multiply blocks for dequantizing them. 
- /// - public static Block8x8F CreateDequantizationTable(ref Block8x8F qt) - { - Block8x8F result = default; - - for (int i = 0; i < Block8x8F.Size; i++) - { - result[Unzig[i]] = qt[i]; - } - - return result; - } } } diff --git a/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs b/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs index dfef139ab0..a0f69bb7bf 100644 --- a/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs +++ b/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs @@ -887,9 +887,10 @@ private void ProcessDefineQuantizationTablesMarker(BufferedReadStream stream, in stream.Read(this.temp, 0, 64); remaining -= 64; + // Parsing quantization table & saving it in natural order for (int j = 0; j < 64; j++) { - table[j] = this.temp[j]; + table[ZigZag.ZigZagOrder[j]] = this.temp[j]; } break; @@ -907,9 +908,10 @@ private void ProcessDefineQuantizationTablesMarker(BufferedReadStream stream, in stream.Read(this.temp, 0, 128); remaining -= 128; + // Parsing quantization table & saving it in natural order for (int j = 0; j < 64; j++) { - table[j] = (this.temp[2 * j] << 8) | this.temp[(2 * j) + 1]; + table[ZigZag.ZigZagOrder[j]] = (this.temp[2 * j] << 8) | this.temp[(2 * j) + 1]; } break; @@ -1069,13 +1071,13 @@ private void ProcessDefineHuffmanTablesMarker(BufferedReadStream stream, int rem // Types 0..1 DC..AC if (tableType > 1) { - JpegThrowHelper.ThrowInvalidImageContentException("Bad Huffman Table type."); + JpegThrowHelper.ThrowInvalidImageContentException($"Bad huffman table type: {tableType}"); } // Max tables of each type if (tableIndex > 3) { - JpegThrowHelper.ThrowInvalidImageContentException("Bad Huffman Table index."); + JpegThrowHelper.ThrowInvalidImageContentException($"Bad huffman table index: {tableIndex}"); } stream.Read(huffmanDataSpan, 0, 16); diff --git a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs index 270a11ed6c..6ff8876672 100644 --- a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs +++ 
b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs @@ -131,28 +131,23 @@ public void Encode(Image image, Stream stream, CancellationToken this.WriteStartOfScan(componentCount, componentIds); // Write the scan compressed data. - var scanEncoder = new HuffmanScanEncoder(stream); - if (this.colorType == JpegColorType.Luminance) - { - // luminance quantization table only. - scanEncoder.EncodeGrayscale(image, ref luminanceQuantTable, cancellationToken); - } - else - { - // luminance and chrominance quantization tables. - switch (this.colorType) - { - case JpegColorType.YCbCrRatio444: - case JpegColorType.Luminance: - scanEncoder.Encode444(image, ref luminanceQuantTable, ref chrominanceQuantTable, cancellationToken); - break; - case JpegColorType.YCbCrRatio420: - scanEncoder.Encode420(image, ref luminanceQuantTable, ref chrominanceQuantTable, cancellationToken); - break; - case JpegColorType.Rgb: - scanEncoder.EncodeRgb(image, ref luminanceQuantTable, cancellationToken); - break; - } + switch (this.colorType) + { + case JpegColorType.YCbCrRatio444: + new HuffmanScanEncoder(3, stream).Encode444(image, ref luminanceQuantTable, ref chrominanceQuantTable, cancellationToken); + break; + case JpegColorType.YCbCrRatio420: + new HuffmanScanEncoder(6, stream).Encode420(image, ref luminanceQuantTable, ref chrominanceQuantTable, cancellationToken); + break; + case JpegColorType.Luminance: + new HuffmanScanEncoder(1, stream).EncodeGrayscale(image, ref luminanceQuantTable, cancellationToken); + break; + case JpegColorType.Rgb: + new HuffmanScanEncoder(3, stream).EncodeRgb(image, ref luminanceQuantTable, cancellationToken); + break; + default: + // all other non-supported color types are checked at the start of this method + break; } // Write the End Of Image marker. 
@@ -193,7 +188,7 @@ private static void WriteDataToDqt(byte[] dqt, ref int offset, QuantIndex i, ref dqt[offset++] = (byte)i; for (int j = 0; j < Block8x8F.Size; j++) { - dqt[offset++] = (byte)quant[j]; + dqt[offset++] = (byte)quant[ZigZag.ZigZagOrder[j]]; } } @@ -735,11 +730,15 @@ private void WriteMarkerHeader(byte marker, int length) /// Initializes quantization tables. /// /// + /// + /// Zig-zag ordering is NOT applied to the resulting tables. + /// + /// /// We take quality values in a hierarchical order: /// 1. Check if encoder has set quality - /// 2. Check if metadata has special table for encoding - /// 3. Check if metadata has set quality - /// 4. Take default quality value - 75 + /// 2. Check if metadata has set quality + /// 3. Take default quality value - 75 + /// /// /// Color components count. /// Jpeg metadata instance. diff --git a/src/ImageSharp/Formats/Tiff/Compression/Decompressors/JpegTiffCompression.cs b/src/ImageSharp/Formats/Tiff/Compression/Decompressors/JpegTiffCompression.cs index bd1c496b49..e764c014d3 100644 --- a/src/ImageSharp/Formats/Tiff/Compression/Decompressors/JpegTiffCompression.cs +++ b/src/ImageSharp/Formats/Tiff/Compression/Decompressors/JpegTiffCompression.cs @@ -65,22 +65,21 @@ protected override void Decompress(BufferedReadStream stream, int byteCount, int scanDecoder.ResetInterval = 0; jpegDecoder.ParseStream(stream, scanDecoder, CancellationToken.None); - using var image = new Image(this.configuration, spectralConverter.PixelBuffer, new ImageMetadata()); - CopyImageBytesToBuffer(buffer, image); + CopyImageBytesToBuffer(buffer, spectralConverter.PixelBuffer); } else { using var image = Image.Load(stream); - CopyImageBytesToBuffer(buffer, image); + CopyImageBytesToBuffer(buffer, image.Frames.RootFrame.PixelBuffer); } } - private static void CopyImageBytesToBuffer(Span buffer, Image image) + private static void CopyImageBytesToBuffer(Span buffer, Buffer2D pixelBuffer) { int offset = 0; - for (int y = 0; y < image.Height; 
y++) + for (int y = 0; y < pixelBuffer.Height; y++) { - Span pixelRowSpan = image.GetPixelRowSpan(y); + Span pixelRowSpan = pixelBuffer.GetRowSpan(y); Span rgbBytes = MemoryMarshal.AsBytes(pixelRowSpan); rgbBytes.CopyTo(buffer.Slice(offset)); offset += rgbBytes.Length; diff --git a/src/ImageSharp/Formats/Tiff/Compression/Decompressors/RgbJpegSpectralConverter.cs b/src/ImageSharp/Formats/Tiff/Compression/Decompressors/RgbJpegSpectralConverter.cs index 45be3dd038..aefec7fa34 100644 --- a/src/ImageSharp/Formats/Tiff/Compression/Decompressors/RgbJpegSpectralConverter.cs +++ b/src/ImageSharp/Formats/Tiff/Compression/Decompressors/RgbJpegSpectralConverter.cs @@ -28,6 +28,6 @@ public RgbJpegSpectralConverter(Configuration configuration, CancellationToken c } /// - public override JpegColorConverter GetColorConverter(JpegFrame frame, IRawJpegData jpegData) => JpegColorConverter.GetConverter(JpegColorSpace.RGB, frame.Precision); + protected override JpegColorConverter GetColorConverter(JpegFrame frame, IRawJpegData jpegData) => JpegColorConverter.GetConverter(JpegColorSpace.RGB, frame.Precision); } } diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Quantize.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Quantize.cs new file mode 100644 index 0000000000..898bbdb456 --- /dev/null +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Quantize.cs @@ -0,0 +1,50 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. 
+ +using BenchmarkDotNet.Attributes; +using SixLabors.ImageSharp.Formats.Jpeg.Components; + +namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations +{ + [Config(typeof(Config.HwIntrinsics_SSE_AVX))] + public class Block8x8F_Quantize + { + private Block8x8F block = CreateFromScalar(1); + private Block8x8F quant = CreateFromScalar(1); + private Block8x8 result = default; + + [Benchmark] + public short Quantize() + { + Block8x8F.Quantize(ref this.block, ref this.result, ref this.quant); + return this.result[0]; + } + + private static Block8x8F CreateFromScalar(float scalar) + { + Block8x8F block = default; + for (int i = 0; i < 64; i++) + { + block[i] = scalar; + } + + return block; + } + } +} + +/* +BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19042.1165 (20H2/October2020Update) +Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores +.NET SDK=6.0.100-preview.3.21202.5 + [Host] : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + 1. No HwIntrinsics : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + 2. SSE : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + 3. 
AVX : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + +| Method | Job | Mean | Error | StdDev | Ratio | +|--------- |-----------------|---------:|---------:|---------:|------:| +| Quantize | No HwIntrinsics | 73.34 ns | 1.081 ns | 1.011 ns | 1.00 | +| Quantize | SSE | 24.11 ns | 0.298 ns | 0.279 ns | 0.33 | +| Quantize | AVX | 15.90 ns | 0.074 ns | 0.065 ns | 0.22 | + */ diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs index 1d103cd1a0..c2efb517a1 100644 --- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs @@ -9,29 +9,44 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations [Config(typeof(Config.HwIntrinsics_SSE_AVX))] public class Block8x8F_Transpose { - private static readonly Block8x8F Source = Create8x8FloatData(); + private Block8x8F source = Create8x8FloatData(); [Benchmark] - public void TransposeInto() + public float TransposeInplace() { - var dest = default(Block8x8F); - Source.TransposeInto(ref dest); + this.source.TransposeInplace(); + return this.source[0]; } private static Block8x8F Create8x8FloatData() { - var result = new float[64]; + Block8x8F block = default; for (int i = 0; i < 8; i++) { for (int j = 0; j < 8; j++) { - result[(i * 8) + j] = (i * 10) + j; + block[(i * 8) + j] = (i * 10) + j; } } - var source = default(Block8x8F); - source.LoadFrom(result); - return source; + return block; } } } + +/* +BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19042.1237 (20H2/October2020Update) +Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores +.NET SDK=6.0.100-preview.3.21202.5 + [Host] : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + 1. 
No HwIntrinsics : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + 2. SSE : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + 3. AVX : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + +Runtime=.NET Core 3.1 + +| Method | Job | Mean | Error | StdDev | Ratio | +|----------------- |----------------:|----------:|----------:|----------:|------:| +| TransposeInplace | No HwIntrinsics | 12.531 ns | 0.0637 ns | 0.0565 ns | 1.00 | +| TransposeInplace | AVX | 5.767 ns | 0.0529 ns | 0.0495 ns | 0.46 | +*/ diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs index 508b4b3b09..0e9bed1d9e 100644 --- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs @@ -111,24 +111,24 @@ private static ImageCodecInfo GetEncoder(ImageFormat format) } /* -BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19042 +BenchmarkDotNet=v0.13.0, OS=Windows 10.0.19042 Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores -.NET Core SDK=6.0.100-preview.3.21202.5 - [Host] : .NET Core 3.1.13 (CoreCLR 4.700.21.11102, CoreFX 4.700.21.11602), X64 RyuJIT - DefaultJob : .NET Core 3.1.13 (CoreCLR 4.700.21.11102, CoreFX 4.700.21.11602), X64 RyuJIT +.NET SDK=6.0.100-preview.3.21202.5 + [Host] : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT + DefaultJob : .NET Core 3.1.18 (CoreCLR 4.700.21.35901, CoreFX 4.700.21.36305), X64 RyuJIT | Method | Quality | Mean | Error | StdDev | Ratio | |---------------------------- |-------- |---------:|---------:|---------:|------:| -| 'System.Drawing Jpeg 4:2:0' | 75 | 29.41 ms | 0.108 ms | 0.096 ms | 1.00 | -| 'ImageSharp Jpeg 4:2:0' | 75 | 26.30 ms | 0.131 ms | 0.109 ms | 0.89 | -| 'ImageSharp Jpeg 4:4:4' | 75 | 36.70 ms | 0.303 ms | 0.269 ms | 1.25 | +| 'System.Drawing Jpeg 4:2:0' | 75 | 30.04 ms | 0.540 ms | 0.479 
ms | 1.00 | +| 'ImageSharp Jpeg 4:2:0' | 75 | 19.32 ms | 0.290 ms | 0.257 ms | 0.64 | +| 'ImageSharp Jpeg 4:4:4' | 75 | 26.76 ms | 0.332 ms | 0.294 ms | 0.89 | | | | | | | | -| 'System.Drawing Jpeg 4:2:0' | 90 | 32.67 ms | 0.226 ms | 0.211 ms | 1.00 | -| 'ImageSharp Jpeg 4:2:0' | 90 | 33.56 ms | 0.237 ms | 0.222 ms | 1.03 | -| 'ImageSharp Jpeg 4:4:4' | 90 | 44.82 ms | 0.250 ms | 0.234 ms | 1.37 | +| 'System.Drawing Jpeg 4:2:0' | 90 | 32.82 ms | 0.184 ms | 0.163 ms | 1.00 | +| 'ImageSharp Jpeg 4:2:0' | 90 | 25.00 ms | 0.408 ms | 0.361 ms | 0.76 | +| 'ImageSharp Jpeg 4:4:4' | 90 | 31.83 ms | 0.636 ms | 0.595 ms | 0.97 | | | | | | | | -| 'System.Drawing Jpeg 4:2:0' | 100 | 39.06 ms | 0.233 ms | 0.218 ms | 1.00 | -| 'ImageSharp Jpeg 4:2:0' | 100 | 40.23 ms | 0.225 ms | 0.277 ms | 1.03 | -| 'ImageSharp Jpeg 4:4:4' | 100 | 63.35 ms | 0.486 ms | 0.431 ms | 1.62 | +| 'System.Drawing Jpeg 4:2:0' | 100 | 39.30 ms | 0.359 ms | 0.318 ms | 1.00 | +| 'ImageSharp Jpeg 4:2:0' | 100 | 34.49 ms | 0.265 ms | 0.235 ms | 0.88 | +| 'ImageSharp Jpeg 4:4:4' | 100 | 56.40 ms | 0.565 ms | 0.501 ms | 1.44 | */ diff --git a/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs b/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs index 5ceb4c8a00..ffe0f4c020 100644 --- a/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs +++ b/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs @@ -65,17 +65,17 @@ public HwIntrinsics_SSE_AVX() .WithId("1. No HwIntrinsics").AsBaseline()); #if SUPPORTS_RUNTIME_INTRINSICS - if (Avx.IsSupported) + if (Sse.IsSupported) { this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core31) - .WithId("2. AVX")); + .WithEnvironmentVariables(new EnvironmentVariable(EnableAVX, Off)) + .WithId("2. SSE")); } - if (Sse.IsSupported) + if (Avx.IsSupported) { this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core31) - .WithEnvironmentVariables(new EnvironmentVariable(EnableAVX, Off)) - .WithId("3. SSE")); + .WithId("3. 
AVX")); } #endif } diff --git a/tests/ImageSharp.Benchmarks/Program.cs b/tests/ImageSharp.Benchmarks/Program.cs index 8080825d9f..f6ffa6f809 100644 --- a/tests/ImageSharp.Benchmarks/Program.cs +++ b/tests/ImageSharp.Benchmarks/Program.cs @@ -1,8 +1,6 @@ -// Copyright (c) Six Labors. +// Copyright (c) Six Labors. // Licensed under the Apache License, Version 2.0. -using System.Reflection; - using BenchmarkDotNet.Running; namespace SixLabors.ImageSharp.Benchmarks @@ -15,9 +13,8 @@ public class Program /// /// The arguments to pass to the program. /// - public static void Main(string[] args) - { - new BenchmarkSwitcher(typeof(Program).GetTypeInfo().Assembly).Run(args); - } + public static void Main(string[] args) => BenchmarkSwitcher + .FromAssembly(typeof(Program).Assembly) + .Run(args); } } diff --git a/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs b/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs index e6e82b9810..51d616fc72 100644 --- a/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs +++ b/tests/ImageSharp.Tests.ProfilingSandbox/Program.cs @@ -1,6 +1,3 @@ -// Copyright (c) Six Labors. -// Licensed under the Apache License, Version 2.0. 
- using System; using SixLabors.ImageSharp.Tests.Formats.Jpg; using SixLabors.ImageSharp.Tests.PixelFormats.PixelOperations; diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs index c68b0ffa85..d01b4b501c 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs @@ -4,7 +4,9 @@ // Uncomment this to turn unit tests into benchmarks: // #define BENCHMARKING using System; -using System.Diagnostics; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics.X86; +#endif using SixLabors.ImageSharp.Formats.Jpeg.Components; using SixLabors.ImageSharp.Tests.Formats.Jpg.Utils; @@ -164,52 +166,27 @@ public void Load_Store_IntArray() } [Fact] - public void TransposeInto() + public void TransposeInplace() { static void RunTest() { float[] expected = Create8x8FloatData(); ReferenceImplementations.Transpose8x8(expected); - var source = default(Block8x8F); - source.LoadFrom(Create8x8FloatData()); + var block8x8 = default(Block8x8F); + block8x8.LoadFrom(Create8x8FloatData()); - var dest = default(Block8x8F); - source.TransposeInto(ref dest); + block8x8.TransposeInplace(); float[] actual = new float[64]; - dest.ScaledCopyTo(actual); + block8x8.ScaledCopyTo(actual); Assert.Equal(expected, actual); } FeatureTestRunner.RunWithHwIntrinsicsFeature( RunTest, - HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX); - } - - private class BufferHolder - { - public Block8x8F Buffer; - } - - [Fact] - public void TransposeInto_Benchmark() - { - var source = new BufferHolder(); - source.Buffer.LoadFrom(Create8x8FloatData()); - var dest = new BufferHolder(); - - this.Output.WriteLine($"TransposeInto_PinningImpl_Benchmark X {Times} ..."); - var sw = Stopwatch.StartNew(); - - for (int i = 0; i < Times; i++) - { - source.Buffer.TransposeInto(ref dest.Buffer); - } - - sw.Stop(); - this.Output.WriteLine($"TransposeInto_PinningImpl_Benchmark finished in 
{sw.ElapsedMilliseconds} ms"); + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX | HwIntrinsics.DisableHWIntrinsic); } private static float[] Create8x8ColorCropTestData() @@ -273,32 +250,44 @@ public void NormalizeColorsAndRoundAvx2(int seed) } [Theory] - [InlineData(1)] - [InlineData(2)] - public unsafe void Quantize(int seed) + [InlineData(1, 2)] + [InlineData(2, 1)] + public void Quantize(int srcSeed, int qtSeed) { - var block = default(Block8x8F); - block.LoadFrom(Create8x8RoundedRandomFloatData(-2000, 2000, seed)); - - var qt = default(Block8x8F); - qt.LoadFrom(Create8x8RoundedRandomFloatData(-2000, 2000, seed)); + static void RunTest(string srcSeedSerialized, string qtSeedSerialized) + { + int srcSeed = FeatureTestRunner.Deserialize(srcSeedSerialized); + int qtSeed = FeatureTestRunner.Deserialize(qtSeedSerialized); - var unzig = ZigZag.CreateUnzigTable(); + Block8x8F source = CreateRandomFloatBlock(-2000, 2000, srcSeed); - int* expectedResults = stackalloc int[Block8x8F.Size]; - ReferenceImplementations.QuantizeRational(&block, expectedResults, &qt, unzig.Data); + // Quantization code is used only in jpeg where it's guaranteed that + // quantization values are greater than 1 + // Quantize method supports negative numbers but very small numbers can cause troubles + Block8x8F quant = CreateRandomFloatBlock(1, 2000, qtSeed); - var actualResults = default(Block8x8F); + // Reference implementation quantizes given block via division + Block8x8 expected = default; + ReferenceImplementations.Quantize(ref source, ref expected, ref quant, ZigZag.ZigZagOrder); - Block8x8F.Quantize(ref block, ref actualResults, ref qt, ref unzig); + // Actual current implementation quantizes given block via multiplication + // With quantization table reciprocal + for (int i = 0; i < Block8x8F.Size; i++) + { + quant[i] = 1f / quant[i]; + } - for (int i = 0; i < Block8x8F.Size; i++) - { - int expected = expectedResults[i]; - int actual = (int)actualResults[i]; + Block8x8 actual = default; +
Block8x8F.Quantize(ref source, ref actual, ref quant); - Assert.Equal(expected, actual); + Assert.True(CompareBlocks(expected, actual, 1, out int diff), $"Blocks are not equal, diff={diff}"); } + + FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + srcSeed, + qtSeed, + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX | HwIntrinsics.DisableSSE); } [Fact] @@ -368,48 +357,6 @@ static void RunTest() HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX); } - [Theory] - [InlineData(1)] - [InlineData(2)] - [InlineData(3)] - public unsafe void DequantizeBlock(int seed) - { - Block8x8F original = CreateRandomFloatBlock(-500, 500, seed); - Block8x8F qt = CreateRandomFloatBlock(0, 10, seed + 42); - - var unzig = ZigZag.CreateUnzigTable(); - - Block8x8F expected = original; - Block8x8F actual = original; - - ReferenceImplementations.DequantizeBlock(&expected, &qt, unzig.Data); - Block8x8F.DequantizeBlock(&actual, &qt, unzig.Data); - - this.CompareBlocks(expected, actual, 0); - } - - [Theory] - [InlineData(1)] - [InlineData(2)] - [InlineData(3)] - public unsafe void ZigZag_CreateDequantizationTable_MultiplicationShouldQuantize(int seed) - { - Block8x8F original = CreateRandomFloatBlock(-500, 500, seed); - Block8x8F qt = CreateRandomFloatBlock(0, 10, seed + 42); - - var unzig = ZigZag.CreateUnzigTable(); - Block8x8F zigQt = ZigZag.CreateDequantizationTable(ref qt); - - Block8x8F expected = original; - Block8x8F actual = original; - - ReferenceImplementations.DequantizeBlock(&expected, &qt, unzig.Data); - - actual.MultiplyInPlace(ref zigQt); - - this.CompareBlocks(expected, actual, 0); - } - [Fact] public void AddToAllInPlace() { @@ -462,7 +409,7 @@ public void LoadFromUInt16Scalar() short[] data = Create8x8ShortData(); - var source = new Block8x8(data); + var source = Block8x8.Load(data); Block8x8F dest = default; dest.LoadFromInt16Scalar(ref source); @@ -483,7 +430,7 @@ public void LoadFromUInt16ExtendedAvx2() short[] data = Create8x8ShortData(); - var source = new 
Block8x8(data); + var source = Block8x8.Load(data); Block8x8F dest = default; dest.LoadFromInt16ExtendedAvx2(ref source); diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs index 9195f09157..3737cce804 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs @@ -1,9 +1,10 @@ // Copyright (c) Six Labors. // Licensed under the Apache License, Version 2.0. +using System; using SixLabors.ImageSharp.Formats.Jpeg.Components; using SixLabors.ImageSharp.Tests.Formats.Jpg.Utils; - +using SixLabors.ImageSharp.Tests.TestUtilities; using Xunit; using Xunit.Abstractions; @@ -22,7 +23,7 @@ public void Construct_And_Indexer_Get() { short[] data = Create8x8ShortData(); - var block = new Block8x8(data); + var block = Block8x8.Load(data); for (int i = 0; i < Block8x8.Size; i++) { @@ -43,32 +44,12 @@ public void Indexer_Set() Assert.Equal(42, block[42]); } - [Fact] - public unsafe void Indexer_GetScalarAt_SetScalarAt() - { - int sum; - var block = default(Block8x8); - - for (int i = 0; i < Block8x8.Size; i++) - { - Block8x8.SetScalarAt(&block, i, (short)i); - } - - sum = 0; - for (int i = 0; i < Block8x8.Size; i++) - { - sum += Block8x8.GetScalarAt(&block, i); - } - - Assert.Equal(sum, 64 * 63 / 2); - } - [Fact] public void AsFloatBlock() { short[] data = Create8x8ShortData(); - var source = new Block8x8(data); + var source = Block8x8.Load(data); Block8x8F dest = source.AsFloatBlock(); @@ -82,7 +63,7 @@ public void AsFloatBlock() public void ToArray() { short[] data = Create8x8ShortData(); - var block = new Block8x8(data); + var block = Block8x8.Load(data); short[] result = block.ToArray(); @@ -93,8 +74,8 @@ public void ToArray() public void Equality_WhenTrue() { short[] data = Create8x8ShortData(); - var block1 = new Block8x8(data); - var block2 = new Block8x8(data); + var block1 = Block8x8.Load(data); + var block2 = Block8x8.Load(data); block1[0] = 
42; block2[0] = 42; @@ -107,8 +88,8 @@ public void Equality_WhenTrue() public void Equality_WhenFalse() { short[] data = Create8x8ShortData(); - var block1 = new Block8x8(data); - var block2 = new Block8x8(data); + var block1 = Block8x8.Load(data); + var block2 = Block8x8.Load(data); block1[0] = 42; block2[0] = 666; @@ -131,8 +112,8 @@ public void IndexerXY() public void TotalDifference() { short[] data = Create8x8ShortData(); - var block1 = new Block8x8(data); - var block2 = new Block8x8(data); + var block1 = Block8x8.Load(data); + var block2 = Block8x8.Load(data); block2[10] += 7; block2[63] += 8; @@ -141,5 +122,159 @@ public void TotalDifference() Assert.Equal(15, d); } + + [Fact] + public void GetLastNonZeroIndex_AllZero() + { + static void RunTest() + { + Block8x8 data = default; + + nint expected = -1; + + nint actual = data.GetLastNonZeroIndex(); + + Assert.Equal(expected, actual); + } + + FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2); + } + + [Fact] + public void GetLastNonZeroIndex_AllNonZero() + { + static void RunTest() + { + Block8x8 data = default; + for (int i = 0; i < Block8x8.Size; i++) + { + data[i] = 10; + } + + nint expected = Block8x8.Size - 1; + + nint actual = data.GetLastNonZeroIndex(); + + Assert.Equal(expected, actual); + } + + FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2); + } + + [Theory] + [InlineData(1)] + [InlineData(2)] + public void GetLastNonZeroIndex_RandomFilledSingle(int seed) + { + static void RunTest(string seedSerialized) + { + int seed = FeatureTestRunner.Deserialize(seedSerialized); + var rng = new Random(seed); + + for (int i = 0; i < 1000; i++) + { + Block8x8 data = default; + + int setIndex = rng.Next(1, Block8x8.Size); + data[setIndex] = (short)rng.Next(-2000, 2000); + + nint expected = setIndex; + + nint actual = data.GetLastNonZeroIndex(); + + Assert.Equal(expected, actual); + } + } + + 
FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + seed, + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2); + } + + [Theory] + [InlineData(1)] + [InlineData(2)] + public void GetLastNonZeroIndex_RandomFilledPartially(int seed) + { + static void RunTest(string seedSerialized) + { + int seed = FeatureTestRunner.Deserialize(seedSerialized); + var rng = new Random(seed); + + for (int i = 0; i < 1000; i++) + { + Block8x8 data = default; + + int lastIndex = rng.Next(1, Block8x8.Size); + short fillValue = (short)rng.Next(-2000, 2000); + for (int dataIndex = 0; dataIndex <= lastIndex; dataIndex++) + { + data[dataIndex] = fillValue; + } + + int expected = lastIndex; + + nint actual = data.GetLastNonZeroIndex(); + + Assert.Equal(expected, actual); + } + } + + FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + seed, + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2); + } + + [Theory] + [InlineData(1)] + [InlineData(2)] + public void GetLastNonZeroIndex_RandomFilledFragmented(int seed) + { + static void RunTest(string seedSerialized) + { + int seed = FeatureTestRunner.Deserialize(seedSerialized); + var rng = new Random(seed); + + for (int i = 0; i < 1000; i++) + { + Block8x8 data = default; + + short fillValue = (short)rng.Next(-2000, 2000); + + // first filled chunk + int firstChunkStart = rng.Next(0, Block8x8.Size / 2); + int firstChunkEnd = rng.Next(firstChunkStart, Block8x8.Size / 2); + for (int dataIdx = firstChunkStart; dataIdx <= firstChunkEnd; dataIdx++) + { + data[dataIdx] = fillValue; + } + + // second filled chunk, there might be a spot with zero(s) between first and second chunk + int secondChunkStart = rng.Next(firstChunkEnd, Block8x8.Size); + int secondChunkEnd = rng.Next(secondChunkStart, Block8x8.Size); + for (int dataIdx = secondChunkStart; dataIdx <= secondChunkEnd; dataIdx++) + { + data[dataIdx] = fillValue; + } + + int expected = secondChunkEnd; + + nint actual = data.GetLastNonZeroIndex(); + + Assert.True(expected == actual, $"Expected: 
{expected}\nActual: {actual}\nInput matrix: {data}"); + } + } + + FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + seed, + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2); + } } } diff --git a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs index d49a6498cd..b4d3769d74 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs @@ -33,15 +33,14 @@ public void LLM_TransformIDCT_CompareToNonOptimized(int seed) { float[] sourceArray = Create8x8RoundedRandomFloatData(-1000, 1000, seed); - var source = Block8x8F.Load(sourceArray); + var srcBlock = Block8x8F.Load(sourceArray); - Block8x8F expected = ReferenceImplementations.LLM_FloatingPoint_DCT.TransformIDCT(ref source); + Block8x8F expected = ReferenceImplementations.LLM_FloatingPoint_DCT.TransformIDCT(ref srcBlock); var temp = default(Block8x8F); - var actual = default(Block8x8F); - FastFloatingPointDCT.TransformIDCT(ref source, ref actual, ref temp); + FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref temp); - this.CompareBlocks(expected, actual, 1f); + this.CompareBlocks(expected, srcBlock, 1f); } [Theory] @@ -52,15 +51,14 @@ public void LLM_TransformIDCT_CompareToAccurate(int seed) { float[] sourceArray = Create8x8RoundedRandomFloatData(-1000, 1000, seed); - var source = Block8x8F.Load(sourceArray); + var srcBlock = Block8x8F.Load(sourceArray); - Block8x8F expected = ReferenceImplementations.AccurateDCT.TransformIDCT(ref source); + Block8x8F expected = ReferenceImplementations.AccurateDCT.TransformIDCT(ref srcBlock); var temp = default(Block8x8F); - var actual = default(Block8x8F); - FastFloatingPointDCT.TransformIDCT(ref source, ref actual, ref temp); + FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref temp); - this.CompareBlocks(expected, actual, 1f); + this.CompareBlocks(expected, srcBlock, 1f); } // Inverse transform @@ -120,24 +118,18 @@ public void IDCT8x4_RightPart(int seed) public void 
IDCT8x8_Avx(int seed) { #if SUPPORTS_RUNTIME_INTRINSICS - var skip = !Avx.IsSupported; -#else - var skip = true; -#endif - - if (skip) + if (!Avx.IsSupported) { this.Output.WriteLine("No AVX present, skipping test!"); - return; } Span src = Create8x8RoundedRandomFloatData(-200, 200, seed); - var srcBlock = default(Block8x8F); + Block8x8F srcBlock = default; srcBlock.LoadFrom(src); - var destBlock = default(Block8x8F); + Block8x8F destBlock = default; - var expectedDest = new float[64]; + float[] expectedDest = new float[64]; // reference, left part ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(src, expectedDest); @@ -148,10 +140,11 @@ public void IDCT8x8_Avx(int seed) // testee, whole 8x8 FastFloatingPointDCT.IDCT8x8_Avx(ref srcBlock, ref destBlock); - var actualDest = new float[64]; + float[] actualDest = new float[64]; destBlock.ScaledCopyTo(actualDest); Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f)); +#endif } [Theory] @@ -167,8 +160,6 @@ static void RunTest(string serialized) var srcBlock = default(Block8x8F); srcBlock.LoadFrom(src); - var destBlock = default(Block8x8F); - var expectedDest = new float[64]; var temp1 = new float[64]; var temp2 = default(Block8x8F); @@ -177,10 +168,10 @@ static void RunTest(string serialized) ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D_llm(src, expectedDest, temp1); // testee - FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref destBlock, ref temp2); + FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref temp2); var actualDest = new float[64]; - destBlock.ScaledCopyTo(actualDest); + srcBlock.ScaledCopyTo(actualDest); Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f)); } @@ -198,95 +189,8 @@ static void RunTest(string serialized) } // Forward transform - [Theory] - [InlineData(1)] - [InlineData(2)] - public void FDCT8x4_LeftPart(int seed) - { - Span src = Create8x8RoundedRandomFloatData(-200, 200, seed); - var srcBlock = default(Block8x8F); - 
srcBlock.LoadFrom(src); - - var destBlock = default(Block8x8F); - - var expectedDest = new float[64]; - - // reference - ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src, expectedDest); - - // testee - FastFloatingPointDCT.FDCT8x4_LeftPart(ref srcBlock, ref destBlock); - - var actualDest = new float[64]; - destBlock.ScaledCopyTo(actualDest); - - Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f)); - } - - [Theory] - [InlineData(1)] - [InlineData(2)] - public void FDCT8x4_RightPart(int seed) - { - Span src = Create8x8RoundedRandomFloatData(-200, 200, seed); - var srcBlock = default(Block8x8F); - srcBlock.LoadFrom(src); - - var destBlock = default(Block8x8F); - - var expectedDest = new float[64]; - - // reference - ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan(4)); - - // testee - FastFloatingPointDCT.FDCT8x4_RightPart(ref srcBlock, ref destBlock); - - var actualDest = new float[64]; - destBlock.ScaledCopyTo(actualDest); - - Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f)); - } - - [Theory] - [InlineData(1)] - [InlineData(2)] - public void FDCT8x8_Avx(int seed) - { -#if SUPPORTS_RUNTIME_INTRINSICS - var skip = !Avx.IsSupported; -#else - var skip = true; -#endif - if (skip) - { - this.Output.WriteLine("No AVX present, skipping test!"); - return; - } - - Span src = Create8x8RoundedRandomFloatData(-200, 200, seed); - var srcBlock = default(Block8x8F); - srcBlock.LoadFrom(src); - - var destBlock = default(Block8x8F); - - var expectedDest = new float[64]; - - // reference, left part - ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src, expectedDest); - - // reference, right part - ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan(4)); - - // testee, whole 8x8 - FastFloatingPointDCT.FDCT8x8_Avx(ref srcBlock, ref destBlock); - - var actualDest = new float[64]; - destBlock.ScaledCopyTo(actualDest); - - 
Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f)); - } - + // This test covers entire FDCT conversions chain + // This test checks all implementations: intrinsic and scalar fallback [Theory] [InlineData(1)] [InlineData(2)] @@ -297,37 +201,38 @@ static void RunTest(string serialized) int seed = FeatureTestRunner.Deserialize(serialized); Span src = Create8x8RoundedRandomFloatData(-200, 200, seed); - var srcBlock = default(Block8x8F); - srcBlock.LoadFrom(src); - - var destBlock = default(Block8x8F); + var block = default(Block8x8F); + block.LoadFrom(src); - var expectedDest = new float[64]; - var temp1 = new float[64]; - var temp2 = default(Block8x8F); + float[] expectedDest = new float[64]; + float[] temp1 = new float[64]; // reference ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D_llm(src, expectedDest, temp1, downscaleBy8: true); // testee - FastFloatingPointDCT.TransformFDCT(ref srcBlock, ref destBlock, ref temp2, false); + // Part of the FDCT calculations is fused into the quantization step + // We must multiply transformed block with reciprocal values from FastFloatingPointDCT.ANN_DCT_reciprocalAdjustmen + FastFloatingPointDCT.TransformFDCT(ref block); + for (int i = 0; i < 64; i++) + { + block[i] = block[i] * FastFloatingPointDCT.DctReciprocalAdjustmentCoefficients[i]; + } - var actualDest = new float[64]; - destBlock.ScaledCopyTo(actualDest); + float[] actualDest = block.ToArray(); - Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f)); + Assert.Equal(expectedDest, actualDest, new ApproximateFloatComparer(1f)); } // 3 paths: // 1. AllowAll - call avx/fma implementation // 2. DisableFMA - call avx implementation without fma acceleration - // 3. DisableAvx - call fallback code of Vector4 implementation - // - // DisableSSE isn't needed because fallback Vector4 code will compile to either sse or fallback code with same result + // 3. DisableAvx - call sse implementation + // 4. 
DisableHWIntrinsic - call scalar fallback implementation FeatureTestRunner.RunWithHwIntrinsicsFeature( RunTest, seed, - HwIntrinsics.AllowAll | HwIntrinsics.DisableFMA | HwIntrinsics.DisableAVX); + HwIntrinsics.AllowAll | HwIntrinsics.DisableFMA | HwIntrinsics.DisableAVX | HwIntrinsics.DisableHWIntrinsic); } } } diff --git a/tests/ImageSharp.Tests/Formats/Jpg/HuffmanScanEncoderTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/HuffmanScanEncoderTests.cs index b953e80b80..42f2fa0d5d 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/HuffmanScanEncoderTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/HuffmanScanEncoderTests.cs @@ -85,157 +85,5 @@ public void GetHuffmanEncodingLength_Random(int seed) Assert.Equal(expected, actual); } } - - [Fact] - public void GetLastValuableElementIndex_AllZero() - { - static void RunTest() - { - Block8x8F data = default; - - int expectedLessThan = 1; - - int actual = HuffmanScanEncoder.GetLastValuableElementIndex(ref data); - - Assert.True(actual < expectedLessThan); - } - - FeatureTestRunner.RunWithHwIntrinsicsFeature( - RunTest, - HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2); - } - - [Fact] - public void GetLastValuableElementIndex_AllNonZero() - { - static void RunTest() - { - Block8x8F data = default; - for (int i = 0; i < Block8x8F.Size; i++) - { - data[i] = 10; - } - - int expected = Block8x8F.Size - 1; - - int actual = HuffmanScanEncoder.GetLastValuableElementIndex(ref data); - - Assert.Equal(expected, actual); - } - - FeatureTestRunner.RunWithHwIntrinsicsFeature( - RunTest, - HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2); - } - - [Theory] - [InlineData(1)] - [InlineData(2)] - public void GetLastValuableElementIndex_RandomFilledSingle(int seed) - { - static void RunTest(string seedSerialized) - { - int seed = FeatureTestRunner.Deserialize(seedSerialized); - var rng = new Random(seed); - - for (int i = 0; i < 1000; i++) - { - Block8x8F data = default; - - int setIndex = rng.Next(1, Block8x8F.Size); - data[setIndex] = 
rng.Next(); - - int expected = setIndex; - - int actual = HuffmanScanEncoder.GetLastValuableElementIndex(ref data); - - Assert.Equal(expected, actual); - } - } - - FeatureTestRunner.RunWithHwIntrinsicsFeature( - RunTest, - seed, - HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2); - } - - [Theory] - [InlineData(1)] - [InlineData(2)] - public void GetLastValuableElementIndex_RandomFilledPartially(int seed) - { - static void RunTest(string seedSerialized) - { - int seed = FeatureTestRunner.Deserialize(seedSerialized); - var rng = new Random(seed); - - for (int i = 0; i < 1000; i++) - { - Block8x8F data = default; - - int lastIndex = rng.Next(1, Block8x8F.Size); - int fillValue = rng.Next(); - for (int dataIndex = 0; dataIndex <= lastIndex; dataIndex++) - { - data[dataIndex] = fillValue; - } - - int expected = lastIndex; - - int actual = HuffmanScanEncoder.GetLastValuableElementIndex(ref data); - - Assert.Equal(expected, actual); - } - } - - FeatureTestRunner.RunWithHwIntrinsicsFeature( - RunTest, - seed, - HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2); - } - - [Theory] - [InlineData(1)] - [InlineData(2)] - public void GetLastValuableElementIndex_RandomFilledFragmented(int seed) - { - static void RunTest(string seedSerialized) - { - int seed = FeatureTestRunner.Deserialize(seedSerialized); - var rng = new Random(seed); - - for (int i = 0; i < 1000; i++) - { - Block8x8F data = default; - - int fillValue = rng.Next(); - - // first filled chunk - int lastIndex1 = rng.Next(1, Block8x8F.Size / 2); - for (int dataIndex = 0; dataIndex <= lastIndex1; dataIndex++) - { - data[dataIndex] = fillValue; - } - - // second filled chunk, there might be a spot with zero(s) between first and second chunk - int lastIndex2 = rng.Next(lastIndex1 + 1, Block8x8F.Size); - for (int dataIndex = 0; dataIndex <= lastIndex2; dataIndex++) - { - data[dataIndex] = fillValue; - } - - int expected = lastIndex2; - - int actual = HuffmanScanEncoder.GetLastValuableElementIndex(ref data); - - 
Assert.Equal(expected, actual); - } - } - - FeatureTestRunner.RunWithHwIntrinsicsFeature( - RunTest, - seed, - HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2); - } } } diff --git a/tests/ImageSharp.Tests/Formats/Jpg/QuantizationTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/QuantizationTests.cs index 03f7020c09..4505ef5386 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/QuantizationTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/QuantizationTests.cs @@ -21,7 +21,9 @@ public void QualityEstimationFromStandardEncoderTables_Luminance() Block8x8F table = JpegQuantization.ScaleLuminanceTable(quality); int estimatedQuality = JpegQuantization.EstimateLuminanceQuality(ref table); - Assert.True(quality.Equals(estimatedQuality), $"Failed to estimate luminance quality for standard table at quality level {quality}"); + Assert.True( + quality.Equals(estimatedQuality), + $"Failed to estimate luminance quality for standard table at quality level {quality}"); } } @@ -35,7 +37,9 @@ public void QualityEstimationFromStandardEncoderTables_Chrominance() Block8x8F table = JpegQuantization.ScaleChrominanceTable(quality); int estimatedQuality = JpegQuantization.EstimateChrominanceQuality(ref table); - Assert.True(quality.Equals(estimatedQuality), $"Failed to estimate chrominance quality for standard table at quality level {quality}"); + Assert.True( + quality.Equals(estimatedQuality), + $"Failed to estimate chrominance quality for standard table at quality level {quality}"); } } } diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Utils/JpegFixture.cs b/tests/ImageSharp.Tests/Formats/Jpg/Utils/JpegFixture.cs index ccb7f6f1eb..1cf9bc4aef 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Utils/JpegFixture.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Utils/JpegFixture.cs @@ -190,6 +190,38 @@ internal void CompareBlocks(Span a, Span b, float tolerance) Assert.False(failed); } + internal static bool CompareBlocks(Block8x8 a, Block8x8 b, int tolerance, out int diff) + { + bool res = 
CompareBlocks(a.AsFloatBlock(), b.AsFloatBlock(), tolerance + 1e-5f, out float fdiff); + diff = (int)fdiff; + return res; + } + + internal static bool CompareBlocks(Block8x8F a, Block8x8F b, float tolerance, out float diff) => + CompareBlocks(a.ToArray(), b.ToArray(), tolerance, out diff); + + internal static bool CompareBlocks(Span a, Span b, float tolerance, out float diff) + { + var comparer = new ApproximateFloatComparer(tolerance); + bool failed = false; + + diff = 0; + + for (int i = 0; i < 64; i++) + { + float expected = a[i]; + float actual = b[i]; + diff += Math.Abs(expected - actual); + + if (!comparer.Equals(expected, actual)) + { + failed = true; + } + } + + return !failed; + } + internal static JpegDecoderCore ParseJpegStream(string testFileName, bool metaDataOnly = false) { byte[] bytes = TestFile.Create(testFileName).Bytes; diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs b/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs index edb8d457b7..560238edb1 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs @@ -53,7 +53,7 @@ internal void MakeBlock(short[] data, int y, int x) { this.MinVal = Math.Min(this.MinVal, data.Min()); this.MaxVal = Math.Max(this.MaxVal, data.Max()); - this.SpectralBlocks[x, y] = new Block8x8(data); + this.SpectralBlocks[x, y] = Block8x8.Load(data); } public void LoadSpectralStride(Buffer2D data, int strideIndex) diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Utils/ReferenceImplementations.cs b/tests/ImageSharp.Tests/Formats/Jpg/Utils/ReferenceImplementations.cs index 2c673f30ee..aa98a7379b 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/Utils/ReferenceImplementations.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/Utils/ReferenceImplementations.cs @@ -15,18 +15,12 @@ namespace SixLabors.ImageSharp.Tests.Formats.Jpg.Utils /// internal static partial class 
ReferenceImplementations { - public static unsafe void DequantizeBlock(Block8x8F* blockPtr, Block8x8F* qtPtr, byte* unzigPtr) + public static void DequantizeBlock(ref Block8x8F block, ref Block8x8F qt, ReadOnlySpan zigzag) { - float* b = (float*)blockPtr; - float* qtp = (float*)qtPtr; - for (int qtIndex = 0; qtIndex < Block8x8F.Size; qtIndex++) + for (int i = 0; i < Block8x8F.Size; i++) { - byte i = unzigPtr[qtIndex]; - float* unzigPos = b + i; - - float val = *unzigPos; - val *= qtp[qtIndex]; - *unzigPos = val; + int zig = zigzag[i]; + block[zig] *= qt[i]; } } @@ -101,42 +95,18 @@ internal static unsafe void CopyColorsTo(ref Block8x8F block, Span buffer, /// /// Reference implementation to test . - /// Rounding is done used an integer-based algorithm defined in . /// - /// The input block - /// The destination block of integers - /// The quantization table - /// Pointer to - public static unsafe void QuantizeRational(Block8x8F* src, int* dest, Block8x8F* qt, byte* unzigPtr) + /// The input block. + /// The destination block of 16bit integers. + /// The quantization table. + /// Zig-Zag index sequence span. + public static void Quantize(ref Block8x8F src, ref Block8x8 dest, ref Block8x8F qt, ReadOnlySpan zigzag) { - float* s = (float*)src; - float* q = (float*)qt; - - for (int zig = 0; zig < Block8x8F.Size; zig++) + for (int i = 0; i < Block8x8F.Size; i++) { - int a = (int)s[unzigPtr[zig]]; - int b = (int)q[zig]; - - int val = RationalRound(a, b); - dest[zig] = val; + int zig = zigzag[i]; + dest[i] = (short)Math.Round(src[zig] / qt[zig], MidpointRounding.AwayFromZero); } } - - /// - /// Rounds a rational number defined as dividend/divisor into an integer. - /// - /// The dividend. - /// The divisor. - /// The rounded value. 
- [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static int RationalRound(int dividend, int divisor) - { - if (dividend >= 0) - { - return (dividend + (divisor >> 1)) / divisor; - } - - return -((-dividend + (divisor >> 1)) / divisor); - } } } diff --git a/tests/ImageSharp.Tests/Formats/Jpg/ZigZagTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/ZigZagTests.cs index e03cf9958f..39046438a8 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/ZigZagTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/ZigZagTests.cs @@ -13,8 +13,7 @@ public class ZigZagTests public void ZigZagCanHandleAllPossibleCoefficients() { // Mimic the behaviour of the huffman scan decoder using all possible byte values - var block = new short[64]; - var zigzag = ZigZag.CreateUnzigTable(); + short[] block = new short[64]; for (int h = 0; h < 255; h++) { @@ -27,7 +26,7 @@ public void ZigZagCanHandleAllPossibleCoefficients() if (s != 0) { i += r; - block[zigzag[i++]] = (short)s; + block[ZigZag.ZigZagOrder[i++]] = (short)s; } else { diff --git a/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs b/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs index fa0f02ca1f..0d2f3fcefb 100644 --- a/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs +++ b/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs @@ -301,6 +301,52 @@ public static void RunWithHwIntrinsicsFeature( } } + /// + /// Runs the given test within an environment + /// where the given features. + /// + /// The test action to run. + /// The value to pass as a parameter #0 to the test action. + /// The value to pass as a parameter #1 to the test action. + /// The intrinsics features. 
+ public static void RunWithHwIntrinsicsFeature( + Action action, + T arg0, + T arg1, + HwIntrinsics intrinsics) + where T : IConvertible + { + if (!RemoteExecutor.IsSupported) + { + return; + } + + foreach (KeyValuePair intrinsic in intrinsics.ToFeatureKeyValueCollection()) + { + var processStartInfo = new ProcessStartInfo(); + if (intrinsic.Key != HwIntrinsics.AllowAll) + { + processStartInfo.Environment[$"COMPlus_{intrinsic.Value}"] = "0"; + + RemoteExecutor.Invoke( + action, + arg0.ToString(), + arg1.ToString(), + new RemoteInvokeOptions + { + StartInfo = processStartInfo + }) + .Dispose(); + } + else + { + // Since we are running using the default architecture there is no + // point creating the overhead of running the action in a separate process. + action(arg0.ToString(), arg1.ToString()); + } + } + } + internal static Dictionary ToFeatureKeyValueCollection(this HwIntrinsics intrinsics) { // Loop through and translate the given values into COMPlus equivaluents