diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
index bc6036903b..9d49b8c45f 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs
@@ -2,17 +2,22 @@
// Licensed under the Apache License, Version 2.0.
using System;
-using System.Diagnostics;
+using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
using System.Text;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components
{
///
- /// Represents a Jpeg block with coefficients.
+ /// 8x8 matrix of coefficients.
///
// ReSharper disable once InconsistentNaming
+ [StructLayout(LayoutKind.Explicit)]
internal unsafe struct Block8x8 : IEquatable
{
///
@@ -20,24 +25,44 @@ internal unsafe struct Block8x8 : IEquatable
///
public const int Size = 64;
+#pragma warning disable IDE0051 // Remove unused private member
///
- /// A fixed size buffer holding the values.
- /// See:
- /// https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/unsafe-code-pointers/fixed-size-buffers
- ///
+ /// A placeholder buffer so the actual struct occupies exactly 64 * 2 bytes.
///
+ ///
+ /// This is not used directly in the code.
+ ///
+ [FieldOffset(0)]
private fixed short data[Size];
-
- ///
- /// Initializes a new instance of the struct.
- ///
- /// A of coefficients
- public Block8x8(Span coefficients)
- {
- ref byte selfRef = ref Unsafe.As(ref this);
- ref byte sourceRef = ref Unsafe.As(ref MemoryMarshal.GetReference(coefficients));
- Unsafe.CopyBlock(ref selfRef, ref sourceRef, Size * sizeof(short));
- }
+#pragma warning restore IDE0051
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+ // SIMD views over the same bytes as the 'data' buffer; every field below overlaps it through its explicit offset.
+ // V0..V7: eight consecutive 16-byte slices at byte offsets 0, 16, ..., 112.
+ [FieldOffset(0)]
+ public Vector128 V0;
+ [FieldOffset(16)]
+ public Vector128 V1;
+ [FieldOffset(32)]
+ public Vector128 V2;
+ [FieldOffset(48)]
+ public Vector128 V3;
+ [FieldOffset(64)]
+ public Vector128 V4;
+ [FieldOffset(80)]
+ public Vector128 V5;
+ [FieldOffset(96)]
+ public Vector128 V6;
+ [FieldOffset(112)]
+ public Vector128 V7;
+
+ // V01..V67: four consecutive 32-byte slices at byte offsets 0, 32, 64 and 96,
+ // each covering a pair of the 16-byte slices above (V01 spans V0 and V1, and so on).
+ [FieldOffset(0)]
+ public Vector256 V01;
+ [FieldOffset(32)]
+ public Vector256 V23;
+ [FieldOffset(64)]
+ public Vector256 V45;
+ [FieldOffset(96)]
+ public Vector256 V67;
+#endif
///
/// Gets or sets a value at the given index
@@ -49,7 +74,8 @@ public short this[int idx]
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get
{
- GuardBlockIndex(idx);
+ DebugGuard.MustBeBetweenOrEqualTo(idx, 0, Size - 1, nameof(idx));
+
ref short selfRef = ref Unsafe.As(ref this);
return Unsafe.Add(ref selfRef, idx);
}
@@ -57,7 +83,8 @@ public short this[int idx]
[MethodImpl(MethodImplOptions.AggressiveInlining)]
set
{
- GuardBlockIndex(idx);
+ DebugGuard.MustBeBetweenOrEqualTo(idx, 0, Size - 1, nameof(idx));
+
ref short selfRef = ref Unsafe.As(ref this);
Unsafe.Add(ref selfRef, idx) = value;
}
@@ -75,15 +102,9 @@ public short this[int idx]
set => this[(y * 8) + x] = value;
}
- public static bool operator ==(Block8x8 left, Block8x8 right)
- {
- return left.Equals(right);
- }
+ /// <summary>
+ /// Equality operator; forwards to <see cref="Equals(Block8x8)"/>.
+ /// </summary>
+ public static bool operator ==(Block8x8 left, Block8x8 right) => left.Equals(right);
- public static bool operator !=(Block8x8 left, Block8x8 right)
- {
- return !left.Equals(right);
- }
+ /// <summary>
+ /// Inequality operator; returns the negation of <see cref="Equals(Block8x8)"/>.
+ /// </summary>
+ public static bool operator !=(Block8x8 left, Block8x8 right) => !left.Equals(right);
///
/// Multiply all elements by a given
@@ -149,34 +170,11 @@ public short this[int idx]
return result;
}
- ///
- /// Pointer-based "Indexer" (getter part)
- ///
- /// Block pointer
- /// Index
- /// The scaleVec value at the specified index
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- public static short GetScalarAt(Block8x8* blockPtr, int idx)
- {
- GuardBlockIndex(idx);
-
- short* fp = blockPtr->data;
- return fp[idx];
- }
-
- ///
- /// Pointer-based "Indexer" (setter part)
- ///
- /// Block pointer
- /// Index
- /// Value
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- public static void SetScalarAt(Block8x8* blockPtr, int idx, short value)
+ /// <summary>
+ /// Creates a block and fills it from <paramref name="data"/> by delegating to LoadFrom.
+ /// </summary>
+ /// <param name="data">The source span that LoadFrom copies from.</param>
+ /// <returns>The newly populated block.</returns>
+ public static Block8x8 Load(Span data)
{
- GuardBlockIndex(idx);
-
- short* fp = blockPtr->data;
- fp[idx] = value;
+ // Zero-initialization is skipped on purpose.
+ // NOTE(review): this relies on the selected LoadFrom overload writing every element of the block - confirm.
+ Unsafe.SkipInit(out Block8x8 result);
+ result.LoadFrom(data);
+ return result;
}
///
@@ -194,7 +192,7 @@ public Block8x8F AsFloatBlock()
///
public short[] ToArray()
{
- var result = new short[Size];
+ short[] result = new short[Size];
this.CopyTo(result);
return result;
}
@@ -206,7 +204,7 @@ public void CopyTo(Span destination)
{
ref byte selfRef = ref Unsafe.As(ref this);
ref byte destRef = ref MemoryMarshal.GetReference(MemoryMarshal.Cast(destination));
- Unsafe.CopyBlock(ref destRef, ref selfRef, Size * sizeof(short));
+ Unsafe.CopyBlockUnaligned(ref destRef, ref selfRef, Size * sizeof(short));
}
///
@@ -220,6 +218,19 @@ public void CopyTo(Span destination)
}
}
+ /// <summary>
+ /// Load raw 16bit integers from source.
+ /// </summary>
+ /// <param name="source">Source span; must contain at least <see cref="Size"/> elements.</param>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public void LoadFrom(Span<short> source)
+ {
+     // The raw copy below always reads Size * sizeof(short) bytes,
+     // so a shorter span would be read past its end. Checked in debug builds only.
+     DebugGuard.MustBeGreaterThanOrEqualTo(source.Length, Size, nameof(source));
+
+     ref byte sourceRef = ref Unsafe.As<short, byte>(ref MemoryMarshal.GetReference(source));
+     ref byte destRef = ref Unsafe.As<Block8x8, byte>(ref this);
+
+     // Unaligned variant: the copy makes no assumption about the alignment of either side.
+     Unsafe.CopyBlockUnaligned(ref destRef, ref sourceRef, Size * sizeof(short));
+ }
+
///
/// Cast and copy -s from the beginning of 'source' span.
///
@@ -231,13 +242,6 @@ public void LoadFrom(Span source)
}
}
- [Conditional("DEBUG")]
- private static void GuardBlockIndex(int idx)
- {
- DebugGuard.MustBeLessThan(idx, Size, nameof(idx));
- DebugGuard.MustBeGreaterThanOrEqualTo(idx, 0, nameof(idx));
- }
-
///
public override string ToString()
{
@@ -271,15 +275,66 @@ public bool Equals(Block8x8 other)
}
///
- public override bool Equals(object obj)
- {
- return obj is Block8x8 other && this.Equals(other);
- }
+ public override bool Equals(object obj) => obj is Block8x8 other && this.Equals(other);
///
- public override int GetHashCode()
+ public override int GetHashCode() => (this[0] * 31) + this[1];
+
+ /// <summary>
+ /// Returns index of the last non-zero element in given matrix.
+ /// </summary>
+ /// <returns>
+ /// Index of the last non-zero element. Returns -1 if all elements are equal to zero.
+ /// </returns>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public nint GetLastNonZeroIndex()
{
- return (this[0] * 31) + this[1];
+#if SUPPORTS_RUNTIME_INTRINSICS
+ if (Avx2.IsSupported)
+ {
+ // All 32 mask bits set: every byte of the 32-byte stride compared equal to zero.
+ const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111);
+
+ Vector256 zero16 = Vector256.Zero;
+
+ ref Vector256 mcuStride = ref Unsafe.As>(ref this);
+
+ // The block is scanned as four 32-byte strides of 16 elements each, from the last stride to the first.
+ for (nint i = 3; i >= 0; i--)
+ {
+ int areEqual = Avx2.MoveMask(Avx2.CompareEqual(Unsafe.Add(ref mcuStride, i), zero16).AsByte());
+
+ if (areEqual != equalityMask)
+ {
+ // Each pair of mask bits holds the comparison result for one 2-byte element of the stride:
+ // the lowest pair belongs to the first element of the stride,
+ // the highest pair belongs to the last element of the stride.
+ // Counting leading zero bits therefore measures the run of zero elements at the end of the stride.
+
+ // In the raw mask 1's mark zero elements and 0's mark non-zero elements, which is the opposite of what lzcnt needs,
+ // so the mask is inverted first
+ int lzcnt = BitOperations.LeadingZeroCount(~(uint)areEqual);
+
+ // Every element contributes 2 bits to the mask, so lzcnt / 2 is the number of trailing zero elements;
+ // 15 minus that number is the stride-relative index of the last non-zero element
+ int strideRelativeIndex = 15 - (lzcnt / 2);
+ return (i * 16) + strideRelativeIndex;
+ }
+ }
+
+ return -1;
+ }
+ else
+#endif
+ {
+ // Scalar fallback: walk backwards from the last element until a non-zero value is found.
+ // The index ends up at -1 when every element is zero.
+ nint index = Size - 1;
+ ref short elemRef = ref Unsafe.As(ref this);
+
+ while (index >= 0 && Unsafe.Add(ref elemRef, index) == 0)
+ {
+ index--;
+ }
+
+ return index;
+ }
}
///
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
new file mode 100644
index 0000000000..0971ccdca0
--- /dev/null
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
@@ -0,0 +1,149 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace SixLabors.ImageSharp.Formats.Jpeg.Components
+{
+ internal partial struct Block8x8F
+ {
+ /// <summary>
+ /// A number of rows of 8 scalar coefficients each in <see cref="Block8x8F"/>
+ /// </summary>
+ public const int RowCount = 8;
+
+ // Eight consecutive 32-byte views at byte offsets 0, 32, ..., 224,
+ // overlaying the Vector4 fields declared in Block8x8F.cs (the struct uses an explicit layout).
+ [FieldOffset(0)]
+ public Vector256 V0;
+ [FieldOffset(32)]
+ public Vector256 V1;
+ [FieldOffset(64)]
+ public Vector256 V2;
+ [FieldOffset(96)]
+ public Vector256 V3;
+ [FieldOffset(128)]
+ public Vector256 V4;
+ [FieldOffset(160)]
+ public Vector256 V5;
+ [FieldOffset(192)]
+ public Vector256 V6;
+ [FieldOffset(224)]
+ public Vector256 V7;
+
+ // Index vector passed to Avx2.PermuteVar8x32 in MultiplyIntoInt16_Avx2.
+ // Selecting 32-bit elements in the order 0, 1, 4, 5, 2, 3, 6, 7 swaps the two middle 64-bit chunks of a 256-bit vector.
+ private static readonly Vector256 MultiplyIntoInt16ShuffleMask = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7);
+
+ /// <summary>
+ /// AVX2 path: multiplies <paramref name="a"/> by <paramref name="b"/> element-wise
+ /// and stores the products in <paramref name="dest"/> as 16-bit integers.
+ /// </summary>
+ /// <param name="a">First factor.</param>
+ /// <param name="b">Second factor.</param>
+ /// <param name="dest">Destination block.</param>
+ private static unsafe void MultiplyIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
+ {
+ DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");
+
+ ref Vector256 aBase = ref a.V0;
+ ref Vector256 bBase = ref b.V0;
+
+ ref Vector256 destRef = ref dest.V01;
+
+ // Two source rows are processed per iteration and written as one 256-bit vector of the destination.
+ for (nint i = 0; i < 8; i += 2)
+ {
+ // Multiply the rows, then convert the products to 32-bit integers.
+ Vector256 row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
+ Vector256 row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));
+
+ // Narrow to 16 bits with signed saturation. The 256-bit pack works on each 128-bit half separately,
+ // so its output interleaves halves of row0 and row1; the permute restores the order
+ // "all of row0, then all of row1".
+ Vector256 row = Avx2.PackSignedSaturate(row0, row1);
+ row = Avx2.PermuteVar8x32(row.AsInt32(), MultiplyIntoInt16ShuffleMask).AsInt16();
+
+ Unsafe.Add(ref destRef, (IntPtr)((uint)i / 2)) = row;
+ }
+ }
+
+ /// <summary>
+ /// SSE2 path: multiplies <paramref name="a"/> by <paramref name="b"/> element-wise
+ /// and stores the products in <paramref name="dest"/> as 16-bit integers.
+ /// </summary>
+ /// <param name="a">First factor.</param>
+ /// <param name="b">Second factor.</param>
+ /// <param name="dest">Destination block.</param>
+ private static void MultiplyIntoInt16_Sse2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
+ {
+ DebugGuard.IsTrue(Sse2.IsSupported, "Sse2 support is required to run this operation!");
+
+ // All three blocks are reinterpreted as sequences of 128-bit vectors.
+ ref Vector128 aBase = ref Unsafe.As>(ref a);
+ ref Vector128 bBase = ref Unsafe.As>(ref b);
+
+ ref Vector128 destBase = ref Unsafe.As>(ref dest);
+
+ // Each iteration consumes two consecutive source vectors and writes one destination vector (index i / 2).
+ for (int i = 0; i < 16; i += 2)
+ {
+ Vector128 left = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
+ Vector128 right = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));
+
+ // Narrow both 32-bit vectors to 16 bits with signed saturation: 'left' fills the low half, 'right' the high half.
+ Vector128 row = Sse2.PackSignedSaturate(left, right);
+ Unsafe.Add(ref destBase, (IntPtr)((uint)i / 2)) = row;
+ }
+ }
+
+ /// <summary>
+ /// AVX implementation of the inplace 8x8 transpose.
+ /// </summary>
+ private void TransposeInplace_Avx()
+ {
+ // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536
+ // All eight combined vectors r0..r7 are read into locals before any field is written,
+ // which is what allows the result to be stored back into the same block.
+
+ // r0..r3: lower 128 bits come from V0..V3, upper 128 bits are replaced by V4L..V7L.
+ Vector256 r0 = Avx.InsertVector128(
+ this.V0,
+ Unsafe.As>(ref this.V4L),
+ 1);
+
+ Vector256 r1 = Avx.InsertVector128(
+ this.V1,
+ Unsafe.As>(ref this.V5L),
+ 1);
+
+ Vector256 r2 = Avx.InsertVector128(
+ this.V2,
+ Unsafe.As>(ref this.V6L),
+ 1);
+
+ Vector256 r3 = Avx.InsertVector128(
+ this.V3,
+ Unsafe.As>(ref this.V7L),
+ 1);
+
+ // r4..r7: lower 128 bits come from V0R..V3R, upper 128 bits from V4R..V7R.
+ Vector256 r4 = Avx.InsertVector128(
+ Unsafe.As>(ref this.V0R).ToVector256(),
+ Unsafe.As>(ref this.V4R),
+ 1);
+
+ Vector256 r5 = Avx.InsertVector128(
+ Unsafe.As>(ref this.V1R).ToVector256(),
+ Unsafe.As>(ref this.V5R),
+ 1);
+
+ Vector256 r6 = Avx.InsertVector128(
+ Unsafe.As>(ref this.V2R).ToVector256(),
+ Unsafe.As>(ref this.V6R),
+ 1);
+
+ Vector256 r7 = Avx.InsertVector128(
+ Unsafe.As>(ref this.V3R).ToVector256(),
+ Unsafe.As>(ref this.V7R),
+ 1);
+
+ // The unpack / shuffle / blend sequence below follows the linked answer; each group of five statements
+ // produces two output rows, which are written straight back into this block.
+ Vector256 t0 = Avx.UnpackLow(r0, r1);
+ Vector256 t2 = Avx.UnpackLow(r2, r3);
+ Vector256 v = Avx.Shuffle(t0, t2, 0x4E);
+ this.V0 = Avx.Blend(t0, v, 0xCC);
+ this.V1 = Avx.Blend(t2, v, 0x33);
+
+ Vector256 t4 = Avx.UnpackLow(r4, r5);
+ Vector256 t6 = Avx.UnpackLow(r6, r7);
+ v = Avx.Shuffle(t4, t6, 0x4E);
+ this.V4 = Avx.Blend(t4, v, 0xCC);
+ this.V5 = Avx.Blend(t6, v, 0x33);
+
+ Vector256 t1 = Avx.UnpackHigh(r0, r1);
+ Vector256 t3 = Avx.UnpackHigh(r2, r3);
+ v = Avx.Shuffle(t1, t3, 0x4E);
+ this.V2 = Avx.Blend(t1, v, 0xCC);
+ this.V3 = Avx.Blend(t3, v, 0x33);
+
+ Vector256 t5 = Avx.UnpackHigh(r4, r5);
+ Vector256 t7 = Avx.UnpackHigh(r6, r7);
+ v = Avx.Shuffle(t5, t7, 0x4E);
+ this.V6 = Avx.Blend(t5, v, 0xCC);
+ this.V7 = Avx.Blend(t7, v, 0x33);
+ }
+ }
+}
+#endif
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.ScaledCopyTo.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.ScaledCopyTo.cs
index 23cf4ce4a9..498fe4d03b 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.ScaledCopyTo.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.ScaledCopyTo.cs
@@ -1,4 +1,4 @@
-// Copyright (c) Six Labors.
+// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.
using System.Numerics;
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
index f669a7ad9a..02f5a13244 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
@@ -16,7 +16,7 @@
namespace SixLabors.ImageSharp.Formats.Jpeg.Components
{
///
- /// Represents a Jpeg block with coefficients.
+ /// 8x8 matrix of coefficients.
///
[StructLayout(LayoutKind.Explicit)]
internal partial struct Block8x8F : IEquatable
@@ -66,30 +66,6 @@ internal partial struct Block8x8F : IEquatable
public Vector4 V7L;
[FieldOffset(240)]
public Vector4 V7R;
-
-#if SUPPORTS_RUNTIME_INTRINSICS
- ///
- /// A number of rows of 8 scalar coefficients each in
- ///
- public const int RowCount = 8;
-
- [FieldOffset(0)]
- public Vector256 V0;
- [FieldOffset(32)]
- public Vector256 V1;
- [FieldOffset(64)]
- public Vector256 V2;
- [FieldOffset(96)]
- public Vector256 V3;
- [FieldOffset(128)]
- public Vector256 V4;
- [FieldOffset(160)]
- public Vector256 V5;
- [FieldOffset(192)]
- public Vector256 V6;
- [FieldOffset(224)]
- public Vector256 V7;
-#endif
#pragma warning restore SA1600 // ElementsMustBeDocumented
///
@@ -102,7 +78,7 @@ public float this[int idx]
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get
{
- GuardBlockIndex(idx);
+ DebugGuard.MustBeBetweenOrEqualTo(idx, 0, Size - 1, nameof(idx));
ref float selfRef = ref Unsafe.As(ref this);
return Unsafe.Add(ref selfRef, (nint)(uint)idx);
}
@@ -110,7 +86,7 @@ public float this[int idx]
[MethodImpl(MethodImplOptions.AggressiveInlining)]
set
{
- GuardBlockIndex(idx);
+ DebugGuard.MustBeBetweenOrEqualTo(idx, 0, Size - 1, nameof(idx));
ref float selfRef = ref Unsafe.As(ref this);
Unsafe.Add(ref selfRef, (nint)(uint)idx) = value;
}
@@ -188,13 +164,6 @@ public static Block8x8F Load(Span data)
return result;
}
- ///
- /// Fill the block with defaults (zeroes).
- ///
- [MethodImpl(InliningOptions.ShortMethod)]
- public void Clear()
- => this = default; // The cheapest way to do this in C#:
-
///
/// Load raw 32bit floating point data from source.
///
@@ -302,7 +271,7 @@ public unsafe void ScaledCopyTo(Span dest)
public float[] ToArray()
{
- var result = new float[Size];
+ float[] result = new float[Size];
this.ScaledCopyTo(result);
return result;
}
@@ -434,102 +403,37 @@ public void AddInPlace(float value)
}
///
- /// Quantize the block.
+ /// Quantize input block, apply zig-zag ordering and store result as 16bit integers.
///
- /// The block pointer.
- /// The qt pointer.
- /// Unzig pointer
- public static unsafe void DequantizeBlock(Block8x8F* blockPtr, Block8x8F* qtPtr, byte* unzigPtr)
- {
- float* b = (float*)blockPtr;
- float* qtp = (float*)qtPtr;
- for (int qtIndex = 0; qtIndex < Size; qtIndex++)
- {
- byte blockIndex = unzigPtr[qtIndex];
- float* unzigPos = b + blockIndex;
-
- float val = *unzigPos;
- val *= qtp[qtIndex];
- *unzigPos = val;
- }
- }
-
- ///
- /// Quantize 'block' into 'dest' using the 'qt' quantization table:
- /// Unzig the elements of block into dest, while dividing them by elements of qt and "pre-rounding" the values.
- /// To finish the rounding it's enough to (int)-cast these values.
- ///
- /// Source block
- /// Destination block
- /// The quantization table
- /// The 8x8 Unzig block.
- public static unsafe void Quantize(
- ref Block8x8F block,
- ref Block8x8F dest,
- ref Block8x8F qt,
- ref ZigZag unZig)
+ /// <param name="block">Source block.</param>
+ /// <param name="dest">Destination block.</param>
+ /// <param name="qt">The quantization table.</param>
+ /// <remarks>
+ /// Every path multiplies the source by <paramref name="qt"/> rather than dividing by it.
+ /// NOTE(review): the table presumably holds reciprocals of the quantization steps - confirm against the callers.
+ /// </remarks>
+ public static void Quantize(ref Block8x8F block, ref Block8x8 dest, ref Block8x8F qt)
{
- for (int zig = 0; zig < Size; zig++)
+#if SUPPORTS_RUNTIME_INTRINSICS
+ if (Avx2.IsSupported)
{
- dest[zig] = block[unZig[zig]];
+ // Multiply and narrow to 16 bits first, then reorder the 16-bit result in place.
+ MultiplyIntoInt16_Avx2(ref block, ref qt, ref dest);
+ ZigZag.ApplyZigZagOrderingAvx2(ref dest);
}
-
- DivideRoundAll(ref dest, ref qt);
- }
-
- [MethodImpl(InliningOptions.ShortMethod)]
- private static void DivideRoundAll(ref Block8x8F a, ref Block8x8F b)
- {
-#if SUPPORTS_RUNTIME_INTRINSICS
- if (Avx.IsSupported)
+ else if (Ssse3.IsSupported)
{
- var vnegOne = Vector256.Create(-1f);
- var vadd = Vector256.Create(.5F);
- var vone = Vector256.Create(1f);
-
- for (int i = 0; i < RowCount; i++)
- {
- ref Vector256 aRow = ref Unsafe.Add(ref a.V0, i);
- ref Vector256 bRow = ref Unsafe.Add(ref b.V0, i);
- Vector256 voff = Avx.Multiply(Avx.Min(Avx.Max(vnegOne, aRow), vone), vadd);
- aRow = Avx.Add(Avx.Divide(aRow, bRow), voff);
- }
+ // Same two steps as above; the multiplication only needs SSE2, the reordering step is the SSSE3 one.
+ MultiplyIntoInt16_Sse2(ref block, ref qt, ref dest);
+ ZigZag.ApplyZigZagOrderingSsse3(ref dest);
}
else
#endif
{
- a.V0L = DivideRound(a.V0L, b.V0L);
- a.V0R = DivideRound(a.V0R, b.V0R);
- a.V1L = DivideRound(a.V1L, b.V1L);
- a.V1R = DivideRound(a.V1R, b.V1R);
- a.V2L = DivideRound(a.V2L, b.V2L);
- a.V2R = DivideRound(a.V2R, b.V2R);
- a.V3L = DivideRound(a.V3L, b.V3L);
- a.V3R = DivideRound(a.V3R, b.V3R);
- a.V4L = DivideRound(a.V4L, b.V4L);
- a.V4R = DivideRound(a.V4R, b.V4R);
- a.V5L = DivideRound(a.V5L, b.V5L);
- a.V5R = DivideRound(a.V5R, b.V5R);
- a.V6L = DivideRound(a.V6L, b.V6L);
- a.V6R = DivideRound(a.V6R, b.V6R);
- a.V7L = DivideRound(a.V7L, b.V7L);
- a.V7R = DivideRound(a.V7R, b.V7R);
+ // Scalar fallback: dest[i] receives the scaled coefficient found at ZigZag.ZigZagOrder[i].
+ for (int i = 0; i < Size; i++)
+ {
+ int idx = ZigZag.ZigZagOrder[i];
+ float quantizedVal = block[idx] * qt[idx];
+ // Add 0.5 with the sign of the value so that the truncating cast below rounds half away from zero.
+ quantizedVal += quantizedVal < 0 ? -0.5f : 0.5f;
+ dest[i] = (short)quantizedVal;
+ }
}
}
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static Vector4 DivideRound(Vector4 dividend, Vector4 divisor)
- {
- var neg = new Vector4(-1);
- var add = new Vector4(.5F);
-
- // sign(dividend) = max(min(dividend, 1), -1)
- Vector4 sign = Numerics.Clamp(dividend, neg, Vector4.One);
-
- // AlmostRound(dividend/divisor) = dividend/divisor + 0.5*sign(dividend)
- return (dividend / divisor) + (sign * add);
- }
-
public void RoundInto(ref Block8x8 dest)
{
for (int i = 0; i < Size; i++)
@@ -627,6 +531,47 @@ public void LoadFromInt16ExtendedAvx2(ref Block8x8 source)
Unsafe.Add(ref dRef, 7) = bottom;
}
+ /// <summary>
+ /// Compares entire 8x8 block to a single scalar value.
+ /// </summary>
+ /// <remarks>
+ /// Every coefficient is truncated to a 32-bit integer before it is compared.
+ /// </remarks>
+ /// <param name="value">Value to compare to.</param>
+ /// <returns>
+ /// <see langword="true"/> if every truncated coefficient equals <paramref name="value"/>; otherwise <see langword="false"/>.
+ /// </returns>
+ public bool EqualsToScalar(int value)
+ {
+#if SUPPORTS_RUNTIME_INTRINSICS
+     if (Avx2.IsSupported)
+     {
+         // All 32 mask bits set: every byte of the comparison result is 0xFF, i.e. all 8 elements matched.
+         const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111);
+
+         var targetVector = Vector256.Create(value);
+         ref Vector256<float> blockStride = ref this.V0;
+
+         for (int i = 0; i < RowCount; i++)
+         {
+             // Truncate one row to integers and compare it against the broadcast target.
+             Vector256<int> areEqual = Avx2.CompareEqual(Avx.ConvertToVector256Int32WithTruncation(Unsafe.Add(ref blockStride, i)), targetVector);
+             if (Avx2.MoveMask(areEqual.AsByte()) != equalityMask)
+             {
+                 return false;
+             }
+         }
+
+         return true;
+     }
+#endif
+     {
+         // Scalar fallback; also the only path when intrinsics are compiled out.
+         ref float scalars = ref Unsafe.As<Block8x8F, float>(ref this);
+
+         for (int i = 0; i < Size; i++)
+         {
+             if ((int)Unsafe.Add(ref scalars, i) != value)
+             {
+                 return false;
+             }
+         }
+
+         return true;
+     }
+ }
+
///
public bool Equals(Block8x8F other)
=> this.V0L == other.V0L
@@ -663,213 +608,89 @@ public override string ToString()
return sb.ToString();
}
- [MethodImpl(InliningOptions.ShortMethod)]
- private static Vector NormalizeAndRound(Vector row, Vector off, Vector max)
- {
- row += off;
- row = Vector.Max(row, Vector.Zero);
- row = Vector.Min(row, max);
- return row.FastRound();
- }
-
- [Conditional("DEBUG")]
- private static void GuardBlockIndex(int idx)
- {
- DebugGuard.MustBeLessThan(idx, Size, nameof(idx));
- DebugGuard.MustBeGreaterThanOrEqualTo(idx, 0, nameof(idx));
- }
-
///
- /// Transpose the block into the destination block.
+ /// Transpose the block inplace.
///
- /// The destination block
[MethodImpl(InliningOptions.ShortMethod)]
- public void TransposeInto(ref Block8x8F d)
+ public void TransposeInplace()
{
#if SUPPORTS_RUNTIME_INTRINSICS
if (Avx.IsSupported)
{
- // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536
- Vector256 r0 = Avx.InsertVector128(
- Unsafe.As>(ref this.V0L).ToVector256(),
- Unsafe.As>(ref this.V4L),
- 1);
-
- Vector256 r1 = Avx.InsertVector128(
- Unsafe.As>(ref this.V1L).ToVector256(),
- Unsafe.As>(ref this.V5L),
- 1);
-
- Vector256 r2 = Avx.InsertVector128(
- Unsafe.As>(ref this.V2L).ToVector256(),
- Unsafe.As>(ref this.V6L),
- 1);
-
- Vector256 r3 = Avx.InsertVector128(
- Unsafe.As>(ref this.V3L).ToVector256(),
- Unsafe.As>(ref this.V7L),
- 1);
-
- Vector256 r4 = Avx.InsertVector128(
- Unsafe.As>(ref this.V0R).ToVector256(),
- Unsafe.As>(ref this.V4R),
- 1);
-
- Vector256 r5 = Avx.InsertVector128(
- Unsafe.As>(ref this.V1R).ToVector256(),
- Unsafe.As>(ref this.V5R),
- 1);
-
- Vector256 r6 = Avx.InsertVector128(
- Unsafe.As>(ref this.V2R).ToVector256(),
- Unsafe.As>(ref this.V6R),
- 1);
-
- Vector256 r7 = Avx.InsertVector128(
- Unsafe.As>(ref this.V3R).ToVector256(),
- Unsafe.As>(ref this.V7R),
- 1);
-
- Vector256 t0 = Avx.UnpackLow(r0, r1);
- Vector256 t2 = Avx.UnpackLow(r2, r3);
- Vector256 v = Avx.Shuffle(t0, t2, 0x4E);
- d.V0 = Avx.Blend(t0, v, 0xCC);
- d.V1 = Avx.Blend(t2, v, 0x33);
-
- Vector256 t4 = Avx.UnpackLow(r4, r5);
- Vector256 t6 = Avx.UnpackLow(r6, r7);
- v = Avx.Shuffle(t4, t6, 0x4E);
- d.V4 = Avx.Blend(t4, v, 0xCC);
- d.V5 = Avx.Blend(t6, v, 0x33);
-
- Vector256 t1 = Avx.UnpackHigh(r0, r1);
- Vector256 t3 = Avx.UnpackHigh(r2, r3);
- v = Avx.Shuffle(t1, t3, 0x4E);
- d.V2 = Avx.Blend(t1, v, 0xCC);
- d.V3 = Avx.Blend(t3, v, 0x33);
-
- Vector256 t5 = Avx.UnpackHigh(r4, r5);
- Vector256 t7 = Avx.UnpackHigh(r6, r7);
- v = Avx.Shuffle(t5, t7, 0x4E);
- d.V6 = Avx.Blend(t5, v, 0xCC);
- d.V7 = Avx.Blend(t7, v, 0x33);
+ this.TransposeInplace_Avx();
}
else
#endif
{
- d.V0L.X = this.V0L.X;
- d.V1L.X = this.V0L.Y;
- d.V2L.X = this.V0L.Z;
- d.V3L.X = this.V0L.W;
- d.V4L.X = this.V0R.X;
- d.V5L.X = this.V0R.Y;
- d.V6L.X = this.V0R.Z;
- d.V7L.X = this.V0R.W;
-
- d.V0L.Y = this.V1L.X;
- d.V1L.Y = this.V1L.Y;
- d.V2L.Y = this.V1L.Z;
- d.V3L.Y = this.V1L.W;
- d.V4L.Y = this.V1R.X;
- d.V5L.Y = this.V1R.Y;
- d.V6L.Y = this.V1R.Z;
- d.V7L.Y = this.V1R.W;
-
- d.V0L.Z = this.V2L.X;
- d.V1L.Z = this.V2L.Y;
- d.V2L.Z = this.V2L.Z;
- d.V3L.Z = this.V2L.W;
- d.V4L.Z = this.V2R.X;
- d.V5L.Z = this.V2R.Y;
- d.V6L.Z = this.V2R.Z;
- d.V7L.Z = this.V2R.W;
-
- d.V0L.W = this.V3L.X;
- d.V1L.W = this.V3L.Y;
- d.V2L.W = this.V3L.Z;
- d.V3L.W = this.V3L.W;
- d.V4L.W = this.V3R.X;
- d.V5L.W = this.V3R.Y;
- d.V6L.W = this.V3R.Z;
- d.V7L.W = this.V3R.W;
-
- d.V0R.X = this.V4L.X;
- d.V1R.X = this.V4L.Y;
- d.V2R.X = this.V4L.Z;
- d.V3R.X = this.V4L.W;
- d.V4R.X = this.V4R.X;
- d.V5R.X = this.V4R.Y;
- d.V6R.X = this.V4R.Z;
- d.V7R.X = this.V4R.W;
-
- d.V0R.Y = this.V5L.X;
- d.V1R.Y = this.V5L.Y;
- d.V2R.Y = this.V5L.Z;
- d.V3R.Y = this.V5L.W;
- d.V4R.Y = this.V5R.X;
- d.V5R.Y = this.V5R.Y;
- d.V6R.Y = this.V5R.Z;
- d.V7R.Y = this.V5R.W;
-
- d.V0R.Z = this.V6L.X;
- d.V1R.Z = this.V6L.Y;
- d.V2R.Z = this.V6L.Z;
- d.V3R.Z = this.V6L.W;
- d.V4R.Z = this.V6R.X;
- d.V5R.Z = this.V6R.Y;
- d.V6R.Z = this.V6R.Z;
- d.V7R.Z = this.V6R.W;
-
- d.V0R.W = this.V7L.X;
- d.V1R.W = this.V7L.Y;
- d.V2R.W = this.V7L.Z;
- d.V3R.W = this.V7L.W;
- d.V4R.W = this.V7R.X;
- d.V5R.W = this.V7R.Y;
- d.V6R.W = this.V7R.Z;
- d.V7R.W = this.V7R.W;
+ this.TransposeInplace_Scalar();
}
}
/// <summary>
- /// Compares entire 8x8 block to a single scalar value.
+ /// Scalar inplace transpose implementation for <see cref="TransposeInplace"/>
/// </summary>
- /// Value to compare to.
- public bool EqualsToScalar(int value)
- {
-#if SUPPORTS_RUNTIME_INTRINSICS
- if (Avx2.IsSupported)
+ [MethodImpl(InliningOptions.ShortMethod)]
+ private void TransposeInplace_Scalar()
+ {
+ ref float elemRef = ref Unsafe.As(ref this);
+
+ // Every element above the main diagonal (index row * 8 + col, col > row) is swapped with
+ // its mirror below the diagonal (index col * 8 + row); diagonal elements stay where they are.
+
+ // row #0
+ Swap(ref Unsafe.Add(ref elemRef, 1), ref Unsafe.Add(ref elemRef, 8));
+ Swap(ref Unsafe.Add(ref elemRef, 2), ref Unsafe.Add(ref elemRef, 16));
+ Swap(ref Unsafe.Add(ref elemRef, 3), ref Unsafe.Add(ref elemRef, 24));
+ Swap(ref Unsafe.Add(ref elemRef, 4), ref Unsafe.Add(ref elemRef, 32));
+ Swap(ref Unsafe.Add(ref elemRef, 5), ref Unsafe.Add(ref elemRef, 40));
+ Swap(ref Unsafe.Add(ref elemRef, 6), ref Unsafe.Add(ref elemRef, 48));
+ Swap(ref Unsafe.Add(ref elemRef, 7), ref Unsafe.Add(ref elemRef, 56));
+
+ // row #1
+ Swap(ref Unsafe.Add(ref elemRef, 10), ref Unsafe.Add(ref elemRef, 17));
+ Swap(ref Unsafe.Add(ref elemRef, 11), ref Unsafe.Add(ref elemRef, 25));
+ Swap(ref Unsafe.Add(ref elemRef, 12), ref Unsafe.Add(ref elemRef, 33));
+ Swap(ref Unsafe.Add(ref elemRef, 13), ref Unsafe.Add(ref elemRef, 41));
+ Swap(ref Unsafe.Add(ref elemRef, 14), ref Unsafe.Add(ref elemRef, 49));
+ Swap(ref Unsafe.Add(ref elemRef, 15), ref Unsafe.Add(ref elemRef, 57));
+
+ // row #2
+ Swap(ref Unsafe.Add(ref elemRef, 19), ref Unsafe.Add(ref elemRef, 26));
+ Swap(ref Unsafe.Add(ref elemRef, 20), ref Unsafe.Add(ref elemRef, 34));
+ Swap(ref Unsafe.Add(ref elemRef, 21), ref Unsafe.Add(ref elemRef, 42));
+ Swap(ref Unsafe.Add(ref elemRef, 22), ref Unsafe.Add(ref elemRef, 50));
+ Swap(ref Unsafe.Add(ref elemRef, 23), ref Unsafe.Add(ref elemRef, 58));
+
+ // row #3
+ Swap(ref Unsafe.Add(ref elemRef, 28), ref Unsafe.Add(ref elemRef, 35));
+ Swap(ref Unsafe.Add(ref elemRef, 29), ref Unsafe.Add(ref elemRef, 43));
+ Swap(ref Unsafe.Add(ref elemRef, 30), ref Unsafe.Add(ref elemRef, 51));
+ Swap(ref Unsafe.Add(ref elemRef, 31), ref Unsafe.Add(ref elemRef, 59));
+
+ // row #4
+ Swap(ref Unsafe.Add(ref elemRef, 37), ref Unsafe.Add(ref elemRef, 44));
+ Swap(ref Unsafe.Add(ref elemRef, 38), ref Unsafe.Add(ref elemRef, 52));
+ Swap(ref Unsafe.Add(ref elemRef, 39), ref Unsafe.Add(ref elemRef, 60));
+
+ // row #5
+ Swap(ref Unsafe.Add(ref elemRef, 46), ref Unsafe.Add(ref elemRef, 53));
+ Swap(ref Unsafe.Add(ref elemRef, 47), ref Unsafe.Add(ref elemRef, 61));
+
+ // row #6
+ Swap(ref Unsafe.Add(ref elemRef, 55), ref Unsafe.Add(ref elemRef, 62));
+
+ // Exchanges the two referenced values.
+ static void Swap(ref float a, ref float b)
{
- const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111);
-
- var targetVector = Vector256.Create(value);
- ref Vector256 blockStride = ref this.V0;
-
- for (int i = 0; i < RowCount; i++)
- {
- Vector256 areEqual = Avx2.CompareEqual(Avx.ConvertToVector256Int32WithTruncation(Unsafe.Add(ref this.V0, i)), targetVector);
- if (Avx2.MoveMask(areEqual.AsByte()) != equalityMask)
- {
- return false;
- }
- }
-
- return true;
+ float tmp = a;
+ a = b;
+ b = tmp;
}
-#endif
- {
- ref float scalars = ref Unsafe.As(ref this);
-
- for (int i = 0; i < Size; i++)
- {
- if ((int)Unsafe.Add(ref scalars, i) != value)
- {
- return false;
- }
- }
+ }
- return true;
- }
+ /// <summary>
+ /// Adds <paramref name="off"/> to <paramref name="row"/>, clamps the sum from below at zero
+ /// and from above at <paramref name="max"/>, then rounds it with the FastRound extension.
+ /// </summary>
+ /// <param name="row">Values to normalize.</param>
+ /// <param name="off">Offset added to every value.</param>
+ /// <param name="max">Upper bound applied after the offset.</param>
+ /// <returns>The offset, clamped and rounded values.</returns>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ private static Vector NormalizeAndRound(Vector row, Vector off, Vector max)
+ {
+ row += off;
+ row = Vector.Max(row, Vector.Zero);
+ row = Vector.Min(row, max);
+ return row.FastRound();
}
}
}
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs
index b5a51c5a4a..bc9a53ea04 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/HuffmanScanDecoder.cs
@@ -58,11 +58,6 @@ internal class HuffmanScanDecoder
///
private readonly HuffmanTable[] acHuffmanTables;
- ///
- /// The unzig data.
- ///
- private ZigZag dctZigZag;
-
private HuffmanScanBuffer scanBuffer;
private readonly SpectralConverter spectralConverter;
@@ -80,7 +75,6 @@ public HuffmanScanDecoder(
SpectralConverter converter,
CancellationToken cancellationToken)
{
- this.dctZigZag = ZigZag.CreateUnzigTable();
this.stream = stream;
this.spectralConverter = converter;
this.cancellationToken = cancellationToken;
@@ -483,7 +477,6 @@ private void DecodeBlockBaseline(
{
ref short blockDataRef = ref Unsafe.As(ref block);
ref HuffmanScanBuffer buffer = ref this.scanBuffer;
- ref ZigZag zigzag = ref this.dctZigZag;
// DC
int t = buffer.DecodeHuffman(ref dcTable);
@@ -508,7 +501,7 @@ private void DecodeBlockBaseline(
{
i += r;
s = buffer.Receive(s);
- Unsafe.Add(ref blockDataRef, zigzag[i++]) = (short)s;
+ Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[i++]) = (short)s;
}
else
{
@@ -562,7 +555,6 @@ private void DecodeBlockProgressiveAC(ref Block8x8 block, ref HuffmanTable acTab
}
ref HuffmanScanBuffer buffer = ref this.scanBuffer;
- ref ZigZag zigzag = ref this.dctZigZag;
int start = this.SpectralStart;
int end = this.SpectralEnd;
int low = this.SuccessiveLow;
@@ -578,7 +570,7 @@ private void DecodeBlockProgressiveAC(ref Block8x8 block, ref HuffmanTable acTab
if (s != 0)
{
s = buffer.Receive(s);
- Unsafe.Add(ref blockDataRef, zigzag[i]) = (short)(s << low);
+ Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[i]) = (short)(s << low);
}
else
{
@@ -608,7 +600,6 @@ private void DecodeBlockProgressiveACRefined(ref short blockDataRef, ref Huffman
{
// Refinement scan for these AC coefficients
ref HuffmanScanBuffer buffer = ref this.scanBuffer;
- ref ZigZag zigzag = ref this.dctZigZag;
int start = this.SpectralStart;
int end = this.SpectralEnd;
@@ -655,7 +646,7 @@ private void DecodeBlockProgressiveACRefined(ref short blockDataRef, ref Huffman
do
{
- ref short coef = ref Unsafe.Add(ref blockDataRef, zigzag[k]);
+ ref short coef = ref Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[k]);
if (coef != 0)
{
buffer.CheckBits();
@@ -681,7 +672,7 @@ private void DecodeBlockProgressiveACRefined(ref short blockDataRef, ref Huffman
if ((s != 0) && (k < 64))
{
- Unsafe.Add(ref blockDataRef, zigzag[k]) = (short)s;
+ Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[k]) = (short)s;
}
}
}
@@ -690,7 +681,7 @@ private void DecodeBlockProgressiveACRefined(ref short blockDataRef, ref Huffman
{
for (; k <= end; k++)
{
- ref short coef = ref Unsafe.Add(ref blockDataRef, zigzag[k]);
+ ref short coef = ref Unsafe.Add(ref blockDataRef, ZigZag.ZigZagOrder[k]);
if (coef != 0)
{
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/IRawJpegData.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/IRawJpegData.cs
index 391dac784f..0b80acc5dc 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/IRawJpegData.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/IRawJpegData.cs
@@ -22,7 +22,7 @@ internal interface IRawJpegData : IDisposable
IJpegComponent[] Components { get; }
///
- /// Gets the quantization tables, in zigzag order.
+ /// Gets the quantization tables, in natural order.
///
Block8x8F[] QuantizationTables { get; }
}
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs
index 7cfbaddcc1..085cd4a291 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/JpegBlockPostProcessor.cs
@@ -19,14 +19,9 @@ internal struct JpegBlockPostProcessor
public Block8x8F SourceBlock;
///
- /// Temporal block 1 to store intermediate and/or final computation results.
+ /// Temporary block to store intermediate computation results.
///
- public Block8x8F WorkspaceBlock1;
-
- ///
- /// Temporal block 2 to store intermediate and/or final computation results.
- ///
- public Block8x8F WorkspaceBlock2;
+ public Block8x8F WorkspaceBlock;
///
/// The quantization table as .
@@ -46,12 +41,11 @@ internal struct JpegBlockPostProcessor
public JpegBlockPostProcessor(IRawJpegData decoder, IJpegComponent component)
{
int qtIndex = component.QuantizationTableIndex;
- this.DequantiazationTable = ZigZag.CreateDequantizationTable(ref decoder.QuantizationTables[qtIndex]);
+ this.DequantiazationTable = decoder.QuantizationTables[qtIndex];
this.subSamplingDivisors = component.SubSamplingDivisors;
this.SourceBlock = default;
- this.WorkspaceBlock1 = default;
- this.WorkspaceBlock2 = default;
+ this.WorkspaceBlock = default;
}
///
@@ -71,20 +65,20 @@ public void ProcessBlockColorsInto(
int destAreaStride,
float maximumValue)
{
- ref Block8x8F b = ref this.SourceBlock;
- b.LoadFrom(ref sourceBlock);
+ ref Block8x8F block = ref this.SourceBlock;
+ block.LoadFrom(ref sourceBlock);
// Dequantize:
- b.MultiplyInPlace(ref this.DequantiazationTable);
+ block.MultiplyInPlace(ref this.DequantiazationTable);
- FastFloatingPointDCT.TransformIDCT(ref b, ref this.WorkspaceBlock1, ref this.WorkspaceBlock2);
+ FastFloatingPointDCT.TransformIDCT(ref block, ref this.WorkspaceBlock);
// To conform better to libjpeg we actually NEED TO loose precision here.
// This is because they store blocks as Int16 between all the operations.
// To be "more accurate", we need to emulate this by rounding!
- this.WorkspaceBlock1.NormalizeColorsAndRoundInPlace(maximumValue);
+ block.NormalizeColorsAndRoundInPlace(maximumValue);
- this.WorkspaceBlock1.ScaledCopyTo(
+ block.ScaledCopyTo(
ref destAreaOrigin,
destAreaStride,
this.subSamplingDivisors.Width,
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs b/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs
index 23bb01409c..e975b11fbb 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs
@@ -39,6 +39,6 @@ internal abstract class SpectralConverter
/// The jpeg frame with the color space to convert to.
/// The raw JPEG data.
/// The color converter.
- public virtual JpegColorConverter GetColorConverter(JpegFrame frame, IRawJpegData jpegData) => JpegColorConverter.GetConverter(jpegData.ColorSpace, frame.Precision);
+ protected virtual JpegColorConverter GetColorConverter(JpegFrame frame, IRawJpegData jpegData) => JpegColorConverter.GetConverter(jpegData.ColorSpace, frame.Precision);
}
}
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs
index ec77bf87db..44b39dfd71 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs
@@ -5,10 +5,25 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
{
///
/// A compiled look-up table representation of a huffmanSpec.
- /// Each value maps to a int32 of which the 24 most significant bits hold the
- /// codeword in bits and the 8 least significant bits hold the codeword size.
/// The maximum codeword size is 16 bits.
///
+ ///
+ ///
+ /// Each value maps to a int32 of which the 24 most significant bits hold the
+ /// codeword in bits and the 8 least significant bits hold the codeword size.
+ ///
+ ///
+ /// Code value occupies 24 most significant bits as integer value.
+ /// This value is shifted to the MSB position for performance reasons.
+ /// For example, decimal value 10 is stored like this:
+ ///
+ /// MSB LSB
+ /// 1010 0000 00000000 00000000 | 00000100
+ ///
+ /// This was done to eliminate extra binary shifts in the encoder.
+ /// While code length is represented as 8 bit integer value
+ ///
+ ///
internal readonly struct HuffmanLut
{
///
@@ -54,7 +69,7 @@ public HuffmanLut(HuffmanSpec spec)
int len = i + 1;
for (int j = 0; j < spec.Count[i]; j++)
{
- this.Values[spec.Values[k]] = len | (code << 8);
+ this.Values[spec.Values[k]] = len | (code << (32 - len));
code++;
k++;
}
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
index 4b74400cac..b3cdbf0a05 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -1,12 +1,11 @@
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.
+using System;
using System.IO;
+using System.Numerics;
using System.Runtime.CompilerServices;
-#if SUPPORTS_RUNTIME_INTRINSICS
-using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
-#endif
+using System.Runtime.InteropServices;
using System.Threading;
using SixLabors.ImageSharp.Memory;
using SixLabors.ImageSharp.PixelFormats;
@@ -16,49 +15,118 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
internal class HuffmanScanEncoder
{
///
- /// Compiled huffman tree to encode given values.
+ /// Maximum number of bytes encoded jpeg 8x8 block can occupy.
+ /// It's highly unlikely for block to occupy this much space - it's a theoretical limit.
///
- /// Yields codewords by index consisting of [run length | bitsize].
- private HuffmanLut[] huffmanTables;
+ ///
+ /// Where 16 is maximum huffman code binary length according to itu
+ /// specs. 10 is maximum value binary length, value comes from discrete
+ /// cosine transform with value range: [-1024..1023]. Block stores
+ /// 8x8 = 64 values thus multiplication by 64. Then divided by 8 to get
+ /// the number of bytes. This value is then multiplied by
+ /// <see cref="MaxBytesPerBlockMultiplier"/> for performance reasons.
+ ///
+ private const int MaxBytesPerBlock = (16 + 10) * 64 / 8 * MaxBytesPerBlockMultiplier;
///
- /// Number of bytes cached before being written to target stream via Stream.Write(byte[], offest, count).
+ /// Multiplier used within cache buffers size calculation.
///
///
- /// This is subject to change, 1024 seems to be the best value in terms of performance.
- /// expects it to be at least 8 (see comments in method body).
+ ///
+ /// Theoretically, bytes buffer can fit
+ /// exactly one minimal coding unit. In reality, coding blocks occupy much
+ /// less space than the theoretical maximum - this can be exploited.
+ /// If temporal buffer size is multiplied by at least 2, second half of
+ /// the resulting buffer will be used as an overflow 'guard' if next
+ /// block would occupy maximum number of bytes. While first half may fit
+ /// many blocks before needing to flush.
+ ///
+ ///
+ /// This is subject to change. This can be equal to 1 but recommended
+ /// value is 2 or even greater - further benchmarking needed.
+ ///
///
- private const int EmitBufferSizeInBytes = 1024;
+ private const int MaxBytesPerBlockMultiplier = 2;
///
- /// A buffer for reducing the number of stream writes when emitting Huffman tables.
+ /// <see cref="streamWriteBuffer"/> size multiplier.
///
- private readonly byte[] emitBuffer = new byte[EmitBufferSizeInBytes];
+ ///
+ /// Jpeg specification requires to insert 'stuff' bytes after each
+ /// 0xff byte value. Worst case scenario is when all bytes are 0xff.
+ /// While it's highly unlikely (if not impossible) to get such
+ /// combination, it's theoretically possible so buffer size must be guarded.
+ ///
+ private const int OutputBufferLengthMultiplier = 2;
///
- /// Number of filled bytes in buffer
+ /// Compiled huffman tree to encode given values.
///
- private int emitLen = 0;
+ /// Yields codewords by index consisting of [run length | bitsize].
+ private HuffmanLut[] huffmanTables;
///
/// Emitted bits 'micro buffer' before being transferred to the .
///
- private int accumulatedBits;
+ private uint accumulatedBits;
+
+ ///
+ /// Buffer for temporal storage of huffman rle encoding bit data.
+ ///
+ ///
+ /// Encoding bits are assembled to 4 byte unsigned integers and then copied to this buffer.
+ /// This process does NOT include inserting stuff bytes.
+ ///
+ private readonly uint[] emitBuffer;
+
+ ///
+ /// Buffer for temporal storage which is then written to the output stream.
+ ///
+ ///
+ /// Encoding bits from <see cref="emitBuffer"/> are copied to this byte buffer including stuff bytes.
+ ///
+ private readonly byte[] streamWriteBuffer;
///
/// Number of jagged bits stored in
///
private int bitCount;
- private Block8x8F temporalBlock1;
- private Block8x8F temporalBlock2;
+ private int emitWriteIndex;
+
+ private Block8x8 tempBlock;
///
/// The output stream. All attempted writes after the first error become no-ops.
///
private readonly Stream target;
- public HuffmanScanEncoder(Stream outputStream) => this.target = outputStream;
+ ///
+ /// Initializes a new instance of the class.
+ ///
+ /// Amount of encoded 8x8 blocks per single jpeg macroblock.
+ /// Output stream for saving encoded data.
+ public HuffmanScanEncoder(int blocksPerCodingUnit, Stream outputStream)
+ {
+ int emitBufferByteLength = MaxBytesPerBlock * blocksPerCodingUnit;
+ this.emitBuffer = new uint[emitBufferByteLength / sizeof(uint)];
+ this.emitWriteIndex = this.emitBuffer.Length;
+
+ this.streamWriteBuffer = new byte[emitBufferByteLength * OutputBufferLengthMultiplier];
+
+ this.target = outputStream;
+ }
+
+ ///
+ /// Gets a value indicating whether <see cref="emitBuffer"/> is full
+ /// and must be flushed using <see cref="FlushToStream()"/>
+ /// before encoding next 8x8 coding block.
+ ///
+ private bool IsStreamFlushNeeded
+ {
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ get => this.emitWriteIndex < (uint)this.emitBuffer.Length / 2;
+ }
///
/// Encodes the image with no subsampling.
@@ -71,9 +139,10 @@ internal class HuffmanScanEncoder
public void Encode444(Image pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken)
where TPixel : unmanaged, IPixel
{
- this.huffmanTables = HuffmanLut.TheHuffmanLut;
+ FastFloatingPointDCT.AdjustToFDCT(ref luminanceQuantTable);
+ FastFloatingPointDCT.AdjustToFDCT(ref chrominanceQuantTable);
- var unzig = ZigZag.CreateUnzigTable();
+ this.huffmanTables = HuffmanLut.TheHuffmanLut;
// ReSharper disable once InconsistentNaming
int prevDCY = 0, prevDCCb = 0, prevDCCr = 0;
@@ -97,26 +166,28 @@ public void Encode444(Image pixels, ref Block8x8F luminanceQuant
QuantIndex.Luminance,
prevDCY,
ref pixelConverter.Y,
- ref luminanceQuantTable,
- ref unzig);
+ ref luminanceQuantTable);
prevDCCb = this.WriteBlock(
QuantIndex.Chrominance,
prevDCCb,
ref pixelConverter.Cb,
- ref chrominanceQuantTable,
- ref unzig);
+ ref chrominanceQuantTable);
prevDCCr = this.WriteBlock(
QuantIndex.Chrominance,
prevDCCr,
ref pixelConverter.Cr,
- ref chrominanceQuantTable,
- ref unzig);
+ ref chrominanceQuantTable);
+
+ if (this.IsStreamFlushNeeded)
+ {
+ this.FlushToStream();
+ }
}
}
- this.FlushInternalBuffer();
+ this.FlushRemainingBytes();
}
///
@@ -131,9 +202,10 @@ public void Encode444(Image pixels, ref Block8x8F luminanceQuant
public void Encode420(Image pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken)
where TPixel : unmanaged, IPixel
{
- this.huffmanTables = HuffmanLut.TheHuffmanLut;
+ FastFloatingPointDCT.AdjustToFDCT(ref luminanceQuantTable);
+ FastFloatingPointDCT.AdjustToFDCT(ref chrominanceQuantTable);
- var unzig = ZigZag.CreateUnzigTable();
+ this.huffmanTables = HuffmanLut.TheHuffmanLut;
// ReSharper disable once InconsistentNaming
int prevDCY = 0, prevDCCb = 0, prevDCCr = 0;
@@ -158,34 +230,35 @@ public void Encode420(Image pixels, ref Block8x8F luminanceQuant
QuantIndex.Luminance,
prevDCY,
ref pixelConverter.YLeft,
- ref luminanceQuantTable,
- ref unzig);
+ ref luminanceQuantTable);
prevDCY = this.WriteBlock(
QuantIndex.Luminance,
prevDCY,
ref pixelConverter.YRight,
- ref luminanceQuantTable,
- ref unzig);
+ ref luminanceQuantTable);
}
prevDCCb = this.WriteBlock(
QuantIndex.Chrominance,
prevDCCb,
ref pixelConverter.Cb,
- ref chrominanceQuantTable,
- ref unzig);
+ ref chrominanceQuantTable);
prevDCCr = this.WriteBlock(
QuantIndex.Chrominance,
prevDCCr,
ref pixelConverter.Cr,
- ref chrominanceQuantTable,
- ref unzig);
+ ref chrominanceQuantTable);
+
+ if (this.IsStreamFlushNeeded)
+ {
+ this.FlushToStream();
+ }
}
}
- this.FlushInternalBuffer();
+ this.FlushRemainingBytes();
}
///
@@ -198,9 +271,9 @@ public void Encode420(Image pixels, ref Block8x8F luminanceQuant
public void EncodeGrayscale(Image pixels, ref Block8x8F luminanceQuantTable, CancellationToken cancellationToken)
where TPixel : unmanaged, IPixel
{
- this.huffmanTables = HuffmanLut.TheHuffmanLut;
+ FastFloatingPointDCT.AdjustToFDCT(ref luminanceQuantTable);
- var unzig = ZigZag.CreateUnzigTable();
+ this.huffmanTables = HuffmanLut.TheHuffmanLut;
// ReSharper disable once InconsistentNaming
int prevDCY = 0;
@@ -223,12 +296,16 @@ public void EncodeGrayscale(Image pixels, ref Block8x8F luminanc
QuantIndex.Luminance,
prevDCY,
ref pixelConverter.Y,
- ref luminanceQuantTable,
- ref unzig);
+ ref luminanceQuantTable);
+
+ if (this.IsStreamFlushNeeded)
+ {
+ this.FlushToStream();
+ }
}
}
- this.FlushInternalBuffer();
+ this.FlushRemainingBytes();
}
///
@@ -236,14 +313,14 @@ public void EncodeGrayscale(Image pixels, ref Block8x8F luminanc
///
/// The pixel format.
/// The pixel accessor providing access to the image pixels.
- /// Luminance quantization table provided by the callee.
+ /// Quantization table provided by the caller.
/// The token to monitor for cancellation.
- public void EncodeRgb(Image pixels, ref Block8x8F luminanceQuantTable, CancellationToken cancellationToken)
+ public void EncodeRgb(Image pixels, ref Block8x8F quantTable, CancellationToken cancellationToken)
where TPixel : unmanaged, IPixel
{
- this.huffmanTables = HuffmanLut.TheHuffmanLut;
+ FastFloatingPointDCT.AdjustToFDCT(ref quantTable);
- var unzig = ZigZag.CreateUnzigTable();
+ this.huffmanTables = HuffmanLut.TheHuffmanLut;
// ReSharper disable once InconsistentNaming
int prevDCR = 0, prevDCG = 0, prevDCB = 0;
@@ -267,26 +344,28 @@ public void EncodeRgb(Image pixels, ref Block8x8F luminanceQuant
QuantIndex.Luminance,
prevDCR,
ref pixelConverter.R,
- ref luminanceQuantTable,
- ref unzig);
+ ref quantTable);
prevDCG = this.WriteBlock(
QuantIndex.Luminance,
prevDCG,
ref pixelConverter.G,
- ref luminanceQuantTable,
- ref unzig);
+ ref quantTable);
prevDCB = this.WriteBlock(
QuantIndex.Luminance,
prevDCB,
ref pixelConverter.B,
- ref luminanceQuantTable,
- ref unzig);
+ ref quantTable);
+
+ if (this.IsStreamFlushNeeded)
+ {
+ this.FlushToStream();
+ }
}
}
- this.FlushInternalBuffer();
+ this.FlushRemainingBytes();
}
///
@@ -296,47 +375,53 @@ public void EncodeRgb(Image pixels, ref Block8x8F luminanceQuant
///
/// The quantization table index.
/// The previous DC value.
- /// Source block
- /// Quantization table
- /// The 8x8 Unzig block.
+ /// Source block.
+ /// Quantization table.
/// The .
private int WriteBlock(
QuantIndex index,
int prevDC,
- ref Block8x8F src,
- ref Block8x8F quant,
- ref ZigZag unZig)
+ ref Block8x8F block,
+ ref Block8x8F quant)
{
- ref Block8x8F refTemp1 = ref this.temporalBlock1;
- ref Block8x8F refTemp2 = ref this.temporalBlock2;
+ ref Block8x8 spectralBlock = ref this.tempBlock;
- FastFloatingPointDCT.TransformFDCT(ref src, ref refTemp1, ref refTemp2);
+ // Shifting level from 0..255 to -128..127
+ block.AddInPlace(-128f);
- Block8x8F.Quantize(ref refTemp1, ref refTemp2, ref quant, ref unZig);
+ // Discrete cosine transform
+ FastFloatingPointDCT.TransformFDCT(ref block);
+
+ // Quantization
+ Block8x8F.Quantize(ref block, ref spectralBlock, ref quant);
// Emit the DC delta.
- int dc = (int)refTemp2[0];
- this.EmitDirectCurrentTerm(this.huffmanTables[2 * (int)index].Values, dc - prevDC);
+ int dc = spectralBlock[0];
+ this.EmitHuffRLE(this.huffmanTables[2 * (int)index].Values, 0, dc - prevDC);
// Emit the AC components.
int[] acHuffTable = this.huffmanTables[(2 * (int)index) + 1].Values;
+ nint lastValuableIndex = spectralBlock.GetLastNonZeroIndex();
+
int runLength = 0;
- int lastValuableIndex = GetLastValuableElementIndex(ref refTemp2);
- for (int zig = 1; zig <= lastValuableIndex; zig++)
+ ref short blockRef = ref Unsafe.As(ref spectralBlock);
+ for (nint zig = 1; zig <= lastValuableIndex; zig++)
{
- int ac = (int)refTemp2[zig];
+ const int zeroRun1 = 1 << 4;
+ const int zeroRun16 = 16 << 4;
+ int ac = Unsafe.Add(ref blockRef, zig);
if (ac == 0)
{
- runLength++;
+ runLength += zeroRun1;
}
else
{
- while (runLength > 15)
+ while (runLength >= zeroRun16)
{
this.EmitHuff(acHuffTable, 0xf0);
- runLength -= 16;
+ runLength -= zeroRun16;
}
this.EmitHuffRLE(acHuffTable, runLength, ac);
@@ -356,100 +441,89 @@ private int WriteBlock(
}
///
- /// Emits the least significant count of bits to the stream write buffer.
- /// The precondition is bits
- ///
- /// < 1<<nBits && nBits <= 16
- ///
- /// .
+ /// Emits the most significant count of bits to the buffer.
///
- /// The packed bits.
- /// The number of bits
+ ///
+ ///
+ /// Supports up to 32 count of bits but, generally speaking, jpeg
+ /// standard assures that there won't be more than 16 bits per single
+ /// value.
+ ///
+ ///
+ /// Emitting algorithm uses 3 intermediate buffers for caching before
+ /// writing to the stream:
+ ///
+ /// -
+ /// uint32
+ ///
+ /// Bit buffer. Encoded spectral values can occupy up to 16 bits, bits
+ /// are assembled to whole bytes via this intermediate buffer.
+ ///
+ ///
+ /// -
+ /// uint32[]
+ ///
+ /// Assembled bytes from uint32 buffer are saved into this buffer.
+ /// uint32 buffer values are saved using indices from the last to the first.
+ /// As bytes are saved to the memory as 4-byte packages endianness matters:
+ /// Jpeg stream is big-endian, indexing buffer bytes from the last index to the
+ /// first eliminates all operations to extract separate bytes. This only works for
+ /// little-endian machines (there are no known examples of big-endian users atm).
+ /// For big-endians this approach is slower due to the separate byte extraction.
+ ///
+ ///
+ /// -
+ /// byte[]
+ ///
+ /// Byte buffer used only during <see cref="FlushToStream(int)"/> method.
+ ///
+ ///
+ ///
+ ///
+ ///
+ /// Bits to emit, must be shifted to the left.
+ /// Bits count stored in the bits parameter.
[MethodImpl(InliningOptions.ShortMethod)]
- private void Emit(int bits, int count)
+ private void Emit(uint bits, int count)
{
+ this.accumulatedBits |= bits >> this.bitCount;
+
count += this.bitCount;
- bits <<= 32 - count;
- bits |= this.accumulatedBits;
- // Only write if more than 8 bits.
- if (count >= 8)
+ if (count >= 32)
{
- // Track length
- while (count >= 8)
- {
- byte b = (byte)(bits >> 24);
- this.emitBuffer[this.emitLen++] = b;
-
- // Adding stuff byte
- // This is because by JPEG standard scan data can contain JPEG markers (indicated by the 0xFF byte, followed by a non-zero byte)
- // Considering this every 0xFF byte must be followed by 0x00 padding byte to signal that this is not a marker
- if (b == byte.MaxValue)
- {
- this.emitBuffer[this.emitLen++] = byte.MinValue;
- }
-
- bits <<= 8;
- count -= 8;
- }
+ this.emitBuffer[--this.emitWriteIndex] = this.accumulatedBits;
+ this.accumulatedBits = bits << (32 - this.bitCount);
- // This can emit 4 times of:
- // 1 byte guaranteed
- // 1 extra byte.MinValue byte if previous one was byte.MaxValue
- // Thus writing (1 + 1) * 4 = 8 bytes max
- // So we must check if emit buffer has extra 8 bytes, if not - call stream.Write
- if (this.emitLen > EmitBufferSizeInBytes - 8)
- {
- this.target.Write(this.emitBuffer, 0, this.emitLen);
- this.emitLen = 0;
- }
+ count -= 32;
}
- this.accumulatedBits = bits;
this.bitCount = count;
}
///
- /// Emits the given value with the given Huffman encoder.
+ /// Emits the given value with the given Huffman table.
///
- /// Compiled Huffman spec values.
- /// The value to encode.
+ /// Huffman table.
+ /// Value to encode.
[MethodImpl(InliningOptions.ShortMethod)]
private void EmitHuff(int[] table, int value)
{
int x = table[value];
- this.Emit(x >> 8, x & 0xff);
- }
-
- [MethodImpl(InliningOptions.ShortMethod)]
- private void EmitDirectCurrentTerm(int[] table, int value)
- {
- int a = value;
- int b = value;
- if (a < 0)
- {
- a = -value;
- b = value - 1;
- }
-
- int bt = GetHuffmanEncodingLength((uint)a);
-
- this.EmitHuff(table, bt);
- if (bt > 0)
- {
- this.Emit(b & ((1 << bt) - 1), bt);
- }
+ this.Emit((uint)x & 0xffff_ff00u, x & 0xff);
}
///
- /// Emits a run of runLength copies of value encoded with the given Huffman encoder.
+ /// Emits given value via huffman rle encoding.
///
- /// Compiled Huffman spec values.
- /// The number of copies to encode.
- /// The value to encode.
+ /// Huffman table.
+ /// The number of preceding zeroes, preshifted by 4 to the left.
+ /// Value to encode.
[MethodImpl(InliningOptions.ShortMethod)]
private void EmitHuffRLE(int[] table, int runLength, int value)
{
+ DebugGuard.IsTrue((runLength & 0xf) == 0, $"{nameof(runLength)} parameter must be shifted to the left by 4 bits");
+
int a = value;
int b = value;
if (a < 0)
@@ -458,25 +532,18 @@ private void EmitHuffRLE(int[] table, int runLength, int value)
b = value - 1;
}
- int bt = GetHuffmanEncodingLength((uint)a);
+ int valueLen = GetHuffmanEncodingLength((uint)a);
- this.EmitHuff(table, (runLength << 4) | bt);
- this.Emit(b & ((1 << bt) - 1), bt);
- }
+ // Huffman prefix code
+ int huffPackage = table[runLength | valueLen];
+ int prefixLen = huffPackage & 0xff;
+ uint prefix = (uint)huffPackage & 0xffff_0000u;
- ///
- /// Writes remaining bytes from internal buffer to the target stream.
- ///
- /// Pads last byte with 1's if necessary
- private void FlushInternalBuffer()
- {
- // pad last byte with 1's
- int padBitsCount = 8 - (this.bitCount % 8);
- if (padBitsCount != 0)
- {
- this.Emit((1 << padBitsCount) - 1, padBitsCount);
- this.target.Write(this.emitBuffer, 0, this.emitLen);
- }
+ // Actual encoded value
+ uint encodedValue = (uint)b << (32 - valueLen);
+
+ // Doing two binary shifts to get rid of leading 1's in negative value case
+ this.Emit(prefix | (encodedValue >> prefixLen), prefixLen + valueLen);
}
///
@@ -498,7 +565,7 @@ internal static int GetHuffmanEncodingLength(uint value)
// Lzcnt would return 32 for input value of 0 - no need to check that with branching
// Fallback code if Lzcnt is not supported still use if-check
// But most modern CPUs support this instruction so this should not be a problem
- return 32 - System.Numerics.BitOperations.LeadingZeroCount(value);
+ return 32 - BitOperations.LeadingZeroCount(value);
#else
// Ideally:
// if 0 - return 0 in this case
@@ -515,65 +582,108 @@ internal static int GetHuffmanEncodingLength(uint value)
}
///
- /// Returns index of the last non-zero element in given mcu block.
- /// If all values of the mcu block are zero, this method might return different results depending on the runtime and hardware support.
- /// This is jpeg mcu specific code, mcu[0] stores a dc value which will be encoded outside of the loop.
- /// This method is guaranteed to return either -1 or 0 if all elements are zero.
+ /// General method for flushing cached spectral data bytes to
+ /// the output stream respecting stuff bytes.
///
///
- /// This is an internal operation supposed to be used only in class for jpeg encoding.
+ /// Bytes cached via <see cref="Emit(uint, int)"/> are stored in 4-bytes blocks
+ /// which makes this method endianness dependent.
///
- /// Mcu block.
- /// Index of the last non-zero element.
[MethodImpl(InliningOptions.ShortMethod)]
- internal static int GetLastValuableElementIndex(ref Block8x8F mcu)
+ private void FlushToStream(int endIndex)
{
-#if SUPPORTS_RUNTIME_INTRINSICS
- if (Avx2.IsSupported)
- {
- const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111);
+ Span emitBytes = MemoryMarshal.AsBytes(this.emitBuffer.AsSpan());
- Vector256 zero8 = Vector256.Zero;
+ int writeIdx = 0;
+ int startIndex = emitBytes.Length - 1;
- ref Vector256 mcuStride = ref mcu.V0;
-
- for (int i = 7; i >= 0; i--)
+ // Some platforms may fail to eliminate this if-else branching
+ // Even if it happens - buffer is flushed in big packs,
+ // branching overhead shouldn't be noticeable
+ if (BitConverter.IsLittleEndian)
+ {
+ // For little endian case bytes are ordered and can be
+ // safely written to the stream with stuff bytes
+ // First byte is cached on the most significant index
+ // so we are going from the end of the array to its beginning:
+ // ... [ double word #1 ] [ double word #0 ]
+ // ... [idx3|idx2|idx1|idx0] [idx3|idx2|idx1|idx0]
+ for (int i = startIndex; i >= endIndex; i--)
{
- int areEqual = Avx2.MoveMask(Avx2.CompareEqual(Avx.ConvertToVector256Int32(Unsafe.Add(ref mcuStride, i)), zero8).AsByte());
+ byte value = emitBytes[i];
+ this.streamWriteBuffer[writeIdx++] = value;
- // we do not know for sure if this stride contain all non-zero elements or if it has some trailing zeros
- if (areEqual != equalityMask)
+ // Inserting stuff byte
+ if (value == 0xff)
{
- // last index in the stride, we go from the end to the start of the stride
- int startIndex = i * 8;
- int index = startIndex + 7;
- ref float elemRef = ref Unsafe.As(ref mcu);
- while (index >= startIndex && (int)Unsafe.Add(ref elemRef, index) == 0)
- {
- index--;
- }
-
- // this implementation will return -1 if all ac components are zero and dc are zero
- return index;
+ this.streamWriteBuffer[writeIdx++] = 0x00;
}
}
-
- return -1;
}
else
-#endif
{
- int index = Block8x8F.Size - 1;
- ref float elemRef = ref Unsafe.As(ref mcu);
-
- while (index > 0 && (int)Unsafe.Add(ref elemRef, index) == 0)
+ // For big endian case bytes are ordered in 4-byte packs
+ // which are ordered like bytes in the little endian case but in 4-byte packs:
+ // ... [ double word #1 ] [ double word #0 ]
+ // ... [idx0|idx1|idx2|idx3] [idx0|idx1|idx2|idx3]
+ // So we must write each 4-bytes in 'natural order'
+ for (int i = startIndex; i >= endIndex; i -= 4)
{
- index--;
- }
+ // This loop is caused by the nature of underlying byte buffer
+ // implementation and indeed costs roughly 5% of performance
+ // compared to little endian scenario
+ // Even with this performance drop this cached buffer implementation
+ // is faster than individually writing bytes using binary shifts and binary and(s)
+ for (int j = i - 3; j <= i; j++)
+ {
+ byte value = emitBytes[j];
+ this.streamWriteBuffer[writeIdx++] = value;
- // this implementation will return 0 if all ac components and dc are zero
- return index;
+ // Inserting stuff byte
+ if (value == 0xff)
+ {
+ this.streamWriteBuffer[writeIdx++] = 0x00;
+ }
+ }
+ }
}
+
+ this.target.Write(this.streamWriteBuffer, 0, writeIdx);
+ }
+
+ ///
+ /// Flushes spectral data bytes after encoding all channel blocks
+ /// in a single jpeg macroblock using <see cref="FlushToStream(int)"/>.
+ ///
+ ///
+ /// This must be called only if <see cref="IsStreamFlushNeeded"/> is true
+ /// only during the macroblocks encoding routine.
+ ///
+ private void FlushToStream()
+ {
+ this.FlushToStream(this.emitWriteIndex * 4);
+ this.emitWriteIndex = this.emitBuffer.Length;
+ }
+
+ /// <summary>
+ /// Flushes final cached bits to the stream padding 1's to
+ /// complement full bytes.
+ /// </summary>
+ /// <remarks>
+ /// This must be called only once at the end of the encoding routine.
+ /// <see cref="IsStreamFlushNeeded"/> check is not needed.
+ /// </remarks>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ private void FlushRemainingBytes()
+ {
+ // Fill the unused low bits of the accumulator with 1's without touching the bits already stored,
+ // only the bytes that actually hold encoded bits are written to the output stream
+ int valuableBytesCount = (int)Numerics.DivideCeil((uint)this.bitCount, 8);
+ uint packedBytes = this.accumulatedBits | (uint.MaxValue >> this.bitCount);
+ this.emitBuffer[--this.emitWriteIndex] = packedBytes;
+
+ // Bytes are flushed from the highest index down, so stop after the top valuableBytesCount bytes of this word
+ this.FlushToStream((this.emitWriteIndex * 4) + 4 - valuableBytesCount);
+ }
}
}
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
new file mode 100644
index 0000000000..ab9462632f
--- /dev/null
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.Intrinsic.cs
@@ -0,0 +1,161 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace SixLabors.ImageSharp.Formats.Jpeg.Components
+{
+ internal static partial class FastFloatingPointDCT
+ {
+#pragma warning disable SA1310, SA1311, IDE1006 // naming rules violation warnings
+ private static readonly Vector256 mm256_F_0_7071 = Vector256.Create(0.707106781f);
+ private static readonly Vector256 mm256_F_0_3826 = Vector256.Create(0.382683433f);
+ private static readonly Vector256 mm256_F_0_5411 = Vector256.Create(0.541196100f);
+ private static readonly Vector256 mm256_F_1_3065 = Vector256.Create(1.306562965f);
+
+ private static readonly Vector256 mm256_F_1_1758 = Vector256.Create(1.175876f);
+ private static readonly Vector256 mm256_F_n1_9615 = Vector256.Create(-1.961570560f);
+ private static readonly Vector256 mm256_F_n0_3901 = Vector256.Create(-0.390180644f);
+ private static readonly Vector256 mm256_F_n0_8999 = Vector256.Create(-0.899976223f);
+ private static readonly Vector256 mm256_F_n2_5629 = Vector256.Create(-2.562915447f);
+ private static readonly Vector256 mm256_F_0_2986 = Vector256.Create(0.298631336f);
+ private static readonly Vector256 mm256_F_2_0531 = Vector256.Create(2.053119869f);
+ private static readonly Vector256 mm256_F_3_0727 = Vector256.Create(3.072711026f);
+ private static readonly Vector256 mm256_F_1_5013 = Vector256.Create(1.501321110f);
+ private static readonly Vector256 mm256_F_n1_8477 = Vector256.Create(-1.847759065f);
+ private static readonly Vector256 mm256_F_0_7653 = Vector256.Create(0.765366865f);
+#pragma warning restore SA1310, SA1311, IDE1006
+
+ ///
+ /// Apply floating point FDCT inplace using simd operations.
+ ///
+ /// Input matrix.
+ private static void ForwardTransform_Avx(ref Block8x8F block)
+ {
+ DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");
+
+ // First pass - process rows
+ block.TransposeInplace();
+ FDCT8x8_Avx(ref block);
+
+ // Second pass - process columns
+ block.TransposeInplace();
+ FDCT8x8_Avx(ref block);
+ }
+
+ ///
+ /// Apply 1D floating point FDCT inplace using AVX operations on 8x8 matrix.
+ ///
+ ///
+ /// Requires Avx support.
+ ///
+ /// Input matrix.
+ public static void FDCT8x8_Avx(ref Block8x8F block)
+ {
+ DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");
+
+ Vector256 tmp0 = Avx.Add(block.V0, block.V7);
+ Vector256 tmp7 = Avx.Subtract(block.V0, block.V7);
+ Vector256 tmp1 = Avx.Add(block.V1, block.V6);
+ Vector256 tmp6 = Avx.Subtract(block.V1, block.V6);
+ Vector256 tmp2 = Avx.Add(block.V2, block.V5);
+ Vector256 tmp5 = Avx.Subtract(block.V2, block.V5);
+ Vector256 tmp3 = Avx.Add(block.V3, block.V4);
+ Vector256 tmp4 = Avx.Subtract(block.V3, block.V4);
+
+ // Even part
+ Vector256 tmp10 = Avx.Add(tmp0, tmp3);
+ Vector256 tmp13 = Avx.Subtract(tmp0, tmp3);
+ Vector256 tmp11 = Avx.Add(tmp1, tmp2);
+ Vector256 tmp12 = Avx.Subtract(tmp1, tmp2);
+
+ block.V0 = Avx.Add(tmp10, tmp11);
+ block.V4 = Avx.Subtract(tmp10, tmp11);
+
+ Vector256 z1 = Avx.Multiply(Avx.Add(tmp12, tmp13), mm256_F_0_7071);
+ block.V2 = Avx.Add(tmp13, z1);
+ block.V6 = Avx.Subtract(tmp13, z1);
+
+ // Odd part
+ tmp10 = Avx.Add(tmp4, tmp5);
+ tmp11 = Avx.Add(tmp5, tmp6);
+ tmp12 = Avx.Add(tmp6, tmp7);
+
+ Vector256 z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), mm256_F_0_3826);
+ Vector256 z2 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_0_5411, tmp10);
+ Vector256 z4 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_1_3065, tmp12);
+ Vector256 z3 = Avx.Multiply(tmp11, mm256_F_0_7071);
+
+ Vector256 z11 = Avx.Add(tmp7, z3);
+ Vector256 z13 = Avx.Subtract(tmp7, z3);
+
+ block.V5 = Avx.Add(z13, z2);
+ block.V3 = Avx.Subtract(z13, z2);
+ block.V1 = Avx.Add(z11, z4);
+ block.V7 = Avx.Subtract(z11, z4);
+ }
+
+ ///
+ /// Combined operation of and
+ /// using AVX commands.
+ ///
+ /// Source
+ /// Destination
+ public static void IDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d)
+ {
+ Debug.Assert(Avx.IsSupported, "AVX is required to execute this method");
+
+ Vector256 my1 = s.V1;
+ Vector256 my7 = s.V7;
+ Vector256 mz0 = Avx.Add(my1, my7);
+
+ Vector256 my3 = s.V3;
+ Vector256 mz2 = Avx.Add(my3, my7);
+ Vector256 my5 = s.V5;
+ Vector256 mz1 = Avx.Add(my3, my5);
+ Vector256 mz3 = Avx.Add(my1, my5);
+
+ Vector256 mz4 = Avx.Multiply(Avx.Add(mz0, mz1), mm256_F_1_1758);
+
+ mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz2, mm256_F_n1_9615);
+ mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz3, mm256_F_n0_3901);
+ mz0 = Avx.Multiply(mz0, mm256_F_n0_8999);
+ mz1 = Avx.Multiply(mz1, mm256_F_n2_5629);
+
+ Vector256 mb3 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my7, mm256_F_0_2986), mz2);
+ Vector256 mb2 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my5, mm256_F_2_0531), mz3);
+ Vector256 mb1 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my3, mm256_F_3_0727), mz2);
+ Vector256 mb0 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my1, mm256_F_1_5013), mz3);
+
+ Vector256 my2 = s.V2;
+ Vector256 my6 = s.V6;
+ mz4 = Avx.Multiply(Avx.Add(my2, my6), mm256_F_0_5411);
+ Vector256 my0 = s.V0;
+ Vector256 my4 = s.V4;
+ mz0 = Avx.Add(my0, my4);
+ mz1 = Avx.Subtract(my0, my4);
+ mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my6, mm256_F_n1_8477);
+ mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my2, mm256_F_0_7653);
+
+ my0 = Avx.Add(mz0, mz3);
+ my3 = Avx.Subtract(mz0, mz3);
+ my1 = Avx.Add(mz1, mz2);
+ my2 = Avx.Subtract(mz1, mz2);
+
+ d.V0 = Avx.Add(my0, mb0);
+ d.V7 = Avx.Subtract(my0, mb0);
+ d.V1 = Avx.Add(my1, mb1);
+ d.V6 = Avx.Subtract(my1, mb1);
+ d.V2 = Avx.Add(my2, mb2);
+ d.V5 = Avx.Subtract(my2, mb2);
+ d.V3 = Avx.Add(my3, mb3);
+ d.V4 = Avx.Subtract(my3, mb3);
+ }
+ }
+}
+#endif
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
index 0f569b5da1..6963c36369 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
@@ -1,11 +1,9 @@
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.
-using System.Diagnostics;
using System.Numerics;
using System.Runtime.CompilerServices;
#if SUPPORTS_RUNTIME_INTRINSICS
-using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
#endif
@@ -19,283 +17,304 @@ internal static partial class FastFloatingPointDCT
{
#pragma warning disable SA1310 // FieldNamesMustNotContainUnderscore
private const float C_1_175876 = 1.175875602f;
-
private const float C_1_961571 = -1.961570560f;
-
private const float C_0_390181 = -0.390180644f;
-
private const float C_0_899976 = -0.899976223f;
-
private const float C_2_562915 = -2.562915447f;
-
private const float C_0_298631 = 0.298631336f;
-
private const float C_2_053120 = 2.053119869f;
-
private const float C_3_072711 = 3.072711026f;
-
private const float C_1_501321 = 1.501321110f;
-
private const float C_0_541196 = 0.541196100f;
-
private const float C_1_847759 = -1.847759065f;
-
private const float C_0_765367 = 0.765366865f;
private const float C_0_125 = 0.1250f;
-#if SUPPORTS_RUNTIME_INTRINSICS
- private static readonly Vector256 C_V_0_5411 = Vector256.Create(0.541196f);
- private static readonly Vector256 C_V_1_3065 = Vector256.Create(1.306563f);
- private static readonly Vector256 C_V_1_1758 = Vector256.Create(1.175876f);
- private static readonly Vector256 C_V_0_7856 = Vector256.Create(0.785695f);
- private static readonly Vector256 C_V_1_3870 = Vector256.Create(1.387040f);
- private static readonly Vector256 C_V_0_2758 = Vector256.Create(0.275899f);
-
- private static readonly Vector256 C_V_n1_9615 = Vector256.Create(-1.961570560f);
- private static readonly Vector256 C_V_n0_3901 = Vector256.Create(-0.390180644f);
- private static readonly Vector256 C_V_n0_8999 = Vector256.Create(-0.899976223f);
- private static readonly Vector256 C_V_n2_5629 = Vector256.Create(-2.562915447f);
- private static readonly Vector256 C_V_0_2986 = Vector256.Create(0.298631336f);
- private static readonly Vector256 C_V_2_0531 = Vector256.Create(2.053119869f);
- private static readonly Vector256 C_V_3_0727 = Vector256.Create(3.072711026f);
- private static readonly Vector256 C_V_1_5013 = Vector256.Create(1.501321110f);
- private static readonly Vector256 C_V_n1_8477 = Vector256.Create(-1.847759065f);
- private static readonly Vector256 C_V_0_7653 = Vector256.Create(0.765366865f);
-
- private static readonly Vector256 C_V_InvSqrt2 = Vector256.Create(0.707107f);
-#endif
+#pragma warning disable SA1311, IDE1006 // naming rules violation warnings
+ private static readonly Vector4 mm128_F_0_7071 = new Vector4(0.707106781f);
+ private static readonly Vector4 mm128_F_0_3826 = new Vector4(0.382683433f);
+ private static readonly Vector4 mm128_F_0_5411 = new Vector4(0.541196100f);
+ private static readonly Vector4 mm128_F_1_3065 = new Vector4(1.306562965f);
+#pragma warning restore SA1311, IDE1006
+
#pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore
- private static readonly Vector4 InvSqrt2 = new Vector4(0.707107f);
///
- /// Original:
- ///
- /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L15
- ///
+ /// Gets reciprocal coefficients for jpeg quantization tables calculation.
///
- /// Source
- /// Destination
- public static void FDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d)
+ ///
+ ///
+ /// The current FDCT implementation expects its results to be multiplied by
+ /// a reciprocal quantization table. To get the 8x8 reciprocal block, the values in this
+ /// table must be divided by the quantization table values scaled with quality settings.
+ ///
+ ///
+ /// These values were calculated with this formula:
+ ///
+ /// value[row * 8 + col] = 1 / (scalefactor[row] * scalefactor[col] * 8);
+ ///
+ /// Where:
+ ///
+ /// scalefactor[0] = 1
+ ///
+ ///
+ /// scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7
+ ///
+ /// Values are also scaled by 8 so DCT code won't do extra division/multiplication.
+ ///
+ ///
+ internal static readonly float[] DctReciprocalAdjustmentCoefficients = new float[]
{
- Vector4 c0 = s.V0L;
- Vector4 c1 = s.V7L;
- Vector4 t0 = c0 + c1;
- Vector4 t7 = c0 - c1;
-
- c1 = s.V6L;
- c0 = s.V1L;
- Vector4 t1 = c0 + c1;
- Vector4 t6 = c0 - c1;
-
- c1 = s.V5L;
- c0 = s.V2L;
- Vector4 t2 = c0 + c1;
- Vector4 t5 = c0 - c1;
-
- c0 = s.V3L;
- c1 = s.V4L;
- Vector4 t3 = c0 + c1;
- Vector4 t4 = c0 - c1;
-
- c0 = t0 + t3;
- Vector4 c3 = t0 - t3;
- c1 = t1 + t2;
- Vector4 c2 = t1 - t2;
-
- d.V0L = c0 + c1;
- d.V4L = c0 - c1;
-
- float w0 = 0.541196f;
- float w1 = 1.306563f;
-
- d.V2L = (w0 * c2) + (w1 * c3);
- d.V6L = (w0 * c3) - (w1 * c2);
-
- w0 = 1.175876f;
- w1 = 0.785695f;
- c3 = (w0 * t4) + (w1 * t7);
- c0 = (w0 * t7) - (w1 * t4);
-
- w0 = 1.387040f;
- w1 = 0.275899f;
- c2 = (w0 * t5) + (w1 * t6);
- c1 = (w0 * t6) - (w1 * t5);
-
- d.V3L = c0 - c2;
- d.V5L = c3 - c1;
-
- float invsqrt2 = 0.707107f;
- c0 = (c0 + c2) * invsqrt2;
- c3 = (c3 + c1) * invsqrt2;
-
- d.V1L = c0 + c3;
- d.V7L = c0 - c3;
- }
+ 0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f,
+ 0.09011998f, 0.064972885f, 0.068974845f, 0.07664074f, 0.09011998f, 0.11470097f, 0.16652f, 0.32664075f,
+ 0.09567086f, 0.068974845f, 0.07322331f, 0.081361376f, 0.09567086f, 0.121765904f, 0.17677669f, 0.34675997f,
+ 0.10630376f, 0.07664074f, 0.081361376f, 0.09040392f, 0.10630376f, 0.13529903f, 0.19642374f, 0.38529903f,
+ 0.125f, 0.09011998f, 0.09567086f, 0.10630376f, 0.125f, 0.15909483f, 0.23096988f, 0.45306373f,
+ 0.15909483f, 0.11470097f, 0.121765904f, 0.13529903f, 0.15909483f, 0.2024893f, 0.2939689f, 0.5766407f,
+ 0.23096988f, 0.16652f, 0.17677669f, 0.19642374f, 0.23096988f, 0.2939689f, 0.4267767f, 0.8371526f,
+ 0.45306373f, 0.32664075f, 0.34675997f, 0.38529903f, 0.45306373f, 0.5766407f, 0.8371526f, 1.642134f,
+ };
///
- /// Original:
- ///
- /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L15
- ///
+ /// Adjusts given quantization table to be compliant with FDCT implementation.
///
- /// Source
- /// Destination
- public static void FDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d)
+ ///
+ /// See <see cref="DctReciprocalAdjustmentCoefficients"/> docs for explanation.
+ ///
+ /// Quantization table to adjust.
+ public static void AdjustToFDCT(ref Block8x8F quantizationtable)
{
- Vector4 c0 = s.V0R;
- Vector4 c1 = s.V7R;
- Vector4 t0 = c0 + c1;
- Vector4 t7 = c0 - c1;
-
- c1 = s.V6R;
- c0 = s.V1R;
- Vector4 t1 = c0 + c1;
- Vector4 t6 = c0 - c1;
-
- c1 = s.V5R;
- c0 = s.V2R;
- Vector4 t2 = c0 + c1;
- Vector4 t5 = c0 - c1;
-
- c0 = s.V3R;
- c1 = s.V4R;
- Vector4 t3 = c0 + c1;
- Vector4 t4 = c0 - c1;
-
- c0 = t0 + t3;
- Vector4 c3 = t0 - t3;
- c1 = t1 + t2;
- Vector4 c2 = t1 - t2;
-
- d.V0R = c0 + c1;
- d.V4R = c0 - c1;
-
- float w0 = 0.541196f;
- float w1 = 1.306563f;
-
- d.V2R = (w0 * c2) + (w1 * c3);
- d.V6R = (w0 * c3) - (w1 * c2);
-
- w0 = 1.175876f;
- w1 = 0.785695f;
- c3 = (w0 * t4) + (w1 * t7);
- c0 = (w0 * t7) - (w1 * t4);
-
- w0 = 1.387040f;
- w1 = 0.275899f;
- c2 = (w0 * t5) + (w1 * t6);
- c1 = (w0 * t6) - (w1 * t5);
-
- d.V3R = c0 - c2;
- d.V5R = c3 - c1;
-
- c0 = (c0 + c2) * InvSqrt2;
- c3 = (c3 + c1) * InvSqrt2;
-
- d.V1R = c0 + c3;
- d.V7R = c0 - c3;
+ for (int i = 0; i < Block8x8F.Size; i++)
+ {
+ quantizationtable[i] = DctReciprocalAdjustmentCoefficients[i] / quantizationtable[i];
+ }
}
///
- /// Combined operation of and
- /// using AVX commands.
+ /// Apply 2D floating point FDCT inplace.
///
- /// Source
- /// Destination
- public static void FDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d)
+ /// Input matrix.
+ public static void TransformFDCT(ref Block8x8F block)
{
#if SUPPORTS_RUNTIME_INTRINSICS
- Debug.Assert(Avx.IsSupported, "AVX is required to execute this method");
-
- Vector256 t0 = Avx.Add(s.V0, s.V7);
- Vector256 t7 = Avx.Subtract(s.V0, s.V7);
- Vector256 t1 = Avx.Add(s.V1, s.V6);
- Vector256 t6 = Avx.Subtract(s.V1, s.V6);
- Vector256 t2 = Avx.Add(s.V2, s.V5);
- Vector256 t5 = Avx.Subtract(s.V2, s.V5);
- Vector256 t3 = Avx.Add(s.V3, s.V4);
- Vector256 t4 = Avx.Subtract(s.V3, s.V4);
-
- Vector256 c0 = Avx.Add(t0, t3);
- Vector256 c1 = Avx.Add(t1, t2);
-
- // 0 4
- d.V0 = Avx.Add(c0, c1);
- d.V4 = Avx.Subtract(c0, c1);
-
- Vector256 c3 = Avx.Subtract(t0, t3);
- Vector256 c2 = Avx.Subtract(t1, t2);
-
- // 2 6
- d.V2 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(c2, C_V_0_5411), c3, C_V_1_3065);
- d.V6 = SimdUtils.HwIntrinsics.MultiplySubstract(Avx.Multiply(c2, C_V_1_3065), c3, C_V_0_5411);
-
- c3 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(t4, C_V_1_1758), t7, C_V_0_7856);
- c0 = SimdUtils.HwIntrinsics.MultiplySubstract(Avx.Multiply(t4, C_V_0_7856), t7, C_V_1_1758);
-
- c2 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(t5, C_V_1_3870), C_V_0_2758, t6);
- c1 = SimdUtils.HwIntrinsics.MultiplySubstract(Avx.Multiply(C_V_0_2758, t5), t6, C_V_1_3870);
-
- // 3 5
- d.V3 = Avx.Subtract(c0, c2);
- d.V5 = Avx.Subtract(c3, c1);
-
- c0 = Avx.Multiply(Avx.Add(c0, c2), C_V_InvSqrt2);
- c3 = Avx.Multiply(Avx.Add(c3, c1), C_V_InvSqrt2);
-
- // 1 7
- d.V1 = Avx.Add(c0, c3);
- d.V7 = Avx.Subtract(c0, c3);
+ if (Avx.IsSupported)
+ {
+ ForwardTransform_Avx(ref block);
+ }
+ else
#endif
+ if (Vector.IsHardwareAccelerated)
+ {
+ ForwardTransform_Vector4(ref block);
+ }
+ else
+ {
+ ForwardTransform_Scalar(ref block);
+ }
}
///
- /// Performs 8x8 matrix Forward Discrete Cosine Transform
+ /// Apply 2D floating point FDCT inplace using scalar operations.
///
- /// Source
- /// Destination
- public static void FDCT8x8(ref Block8x8F s, ref Block8x8F d)
+ ///
+ /// Ported from libjpeg-turbo https://github.com/libjpeg-turbo/libjpeg-turbo/blob/main/jfdctflt.c.
+ ///
+ /// Input matrix.
+ private static void ForwardTransform_Scalar(ref Block8x8F block)
{
-#if SUPPORTS_RUNTIME_INTRINSICS
- if (Avx.IsSupported)
+ const int dctSize = 8;
+
+ float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ float tmp10, tmp11, tmp12, tmp13;
+ float z1, z2, z3, z4, z5, z11, z13;
+
+ // First pass - process rows
+ ref float dataRef = ref Unsafe.As(ref block);
+ for (int ctr = 7; ctr >= 0; ctr--)
{
- FDCT8x8_Avx(ref s, ref d);
+ tmp0 = Unsafe.Add(ref dataRef, 0) + Unsafe.Add(ref dataRef, 7);
+ tmp7 = Unsafe.Add(ref dataRef, 0) - Unsafe.Add(ref dataRef, 7);
+ tmp1 = Unsafe.Add(ref dataRef, 1) + Unsafe.Add(ref dataRef, 6);
+ tmp6 = Unsafe.Add(ref dataRef, 1) - Unsafe.Add(ref dataRef, 6);
+ tmp2 = Unsafe.Add(ref dataRef, 2) + Unsafe.Add(ref dataRef, 5);
+ tmp5 = Unsafe.Add(ref dataRef, 2) - Unsafe.Add(ref dataRef, 5);
+ tmp3 = Unsafe.Add(ref dataRef, 3) + Unsafe.Add(ref dataRef, 4);
+ tmp4 = Unsafe.Add(ref dataRef, 3) - Unsafe.Add(ref dataRef, 4);
+
+ // Even part
+ tmp10 = tmp0 + tmp3;
+ tmp13 = tmp0 - tmp3;
+ tmp11 = tmp1 + tmp2;
+ tmp12 = tmp1 - tmp2;
+
+ Unsafe.Add(ref dataRef, 0) = tmp10 + tmp11;
+ Unsafe.Add(ref dataRef, 4) = tmp10 - tmp11;
+
+ z1 = (tmp12 + tmp13) * 0.707106781f;
+ Unsafe.Add(ref dataRef, 2) = tmp13 + z1;
+ Unsafe.Add(ref dataRef, 6) = tmp13 - z1;
+
+ // Odd part
+ tmp10 = tmp4 + tmp5;
+ tmp11 = tmp5 + tmp6;
+ tmp12 = tmp6 + tmp7;
+
+ z5 = (tmp10 - tmp12) * 0.382683433f;
+ z2 = (0.541196100f * tmp10) + z5;
+ z4 = (1.306562965f * tmp12) + z5;
+ z3 = tmp11 * 0.707106781f;
+
+ z11 = tmp7 + z3;
+ z13 = tmp7 - z3;
+
+ Unsafe.Add(ref dataRef, 5) = z13 + z2;
+ Unsafe.Add(ref dataRef, 3) = z13 - z2;
+ Unsafe.Add(ref dataRef, 1) = z11 + z4;
+ Unsafe.Add(ref dataRef, 7) = z11 - z4;
+
+ dataRef = ref Unsafe.Add(ref dataRef, dctSize);
}
- else
-#endif
+
+ // Second pass - process columns
+ dataRef = ref Unsafe.As(ref block);
+ for (int ctr = 7; ctr >= 0; ctr--)
{
- FDCT8x4_LeftPart(ref s, ref d);
- FDCT8x4_RightPart(ref s, ref d);
+ tmp0 = Unsafe.Add(ref dataRef, dctSize * 0) + Unsafe.Add(ref dataRef, dctSize * 7);
+ tmp7 = Unsafe.Add(ref dataRef, dctSize * 0) - Unsafe.Add(ref dataRef, dctSize * 7);
+ tmp1 = Unsafe.Add(ref dataRef, dctSize * 1) + Unsafe.Add(ref dataRef, dctSize * 6);
+ tmp6 = Unsafe.Add(ref dataRef, dctSize * 1) - Unsafe.Add(ref dataRef, dctSize * 6);
+ tmp2 = Unsafe.Add(ref dataRef, dctSize * 2) + Unsafe.Add(ref dataRef, dctSize * 5);
+ tmp5 = Unsafe.Add(ref dataRef, dctSize * 2) - Unsafe.Add(ref dataRef, dctSize * 5);
+ tmp3 = Unsafe.Add(ref dataRef, dctSize * 3) + Unsafe.Add(ref dataRef, dctSize * 4);
+ tmp4 = Unsafe.Add(ref dataRef, dctSize * 3) - Unsafe.Add(ref dataRef, dctSize * 4);
+
+ // Even part
+ tmp10 = tmp0 + tmp3;
+ tmp13 = tmp0 - tmp3;
+ tmp11 = tmp1 + tmp2;
+ tmp12 = tmp1 - tmp2;
+
+ Unsafe.Add(ref dataRef, dctSize * 0) = tmp10 + tmp11;
+ Unsafe.Add(ref dataRef, dctSize * 4) = tmp10 - tmp11;
+
+ z1 = (tmp12 + tmp13) * 0.707106781f;
+ Unsafe.Add(ref dataRef, dctSize * 2) = tmp13 + z1;
+ Unsafe.Add(ref dataRef, dctSize * 6) = tmp13 - z1;
+
+ // Odd part
+ tmp10 = tmp4 + tmp5;
+ tmp11 = tmp5 + tmp6;
+ tmp12 = tmp6 + tmp7;
+
+ z5 = (tmp10 - tmp12) * 0.382683433f;
+ z2 = (0.541196100f * tmp10) + z5;
+ z4 = (1.306562965f * tmp12) + z5;
+ z3 = tmp11 * 0.707106781f;
+
+ z11 = tmp7 + z3;
+ z13 = tmp7 - z3;
+
+ Unsafe.Add(ref dataRef, dctSize * 5) = z13 + z2;
+ Unsafe.Add(ref dataRef, dctSize * 3) = z13 - z2;
+ Unsafe.Add(ref dataRef, dctSize * 1) = z11 + z4;
+ Unsafe.Add(ref dataRef, dctSize * 7) = z11 - z4;
+
+ dataRef = ref Unsafe.Add(ref dataRef, 1);
}
}
///
- /// Apply floating point FDCT from src into dest
+ /// Apply floating point FDCT inplace using the <see cref="Vector4"/> API.
///
- /// Source
- /// Destination
- /// Temporary block provided by the caller for optimization
- /// If true, a constant -128.0 offset is applied for all values before FDCT
- public static void TransformFDCT(
- ref Block8x8F src,
- ref Block8x8F dest,
- ref Block8x8F temp,
- bool offsetSourceByNeg128 = true)
+ ///
+ /// This implementation must be called only if the hardware supports vectors of 4
+ /// floating point numbers. Otherwise the explicit scalar
+ /// implementation <see cref="ForwardTransform_Scalar"/> is faster
+ /// because it does not rely on matrix transposition.
+ ///
+ /// Input matrix.
+ private static void ForwardTransform_Vector4(ref Block8x8F block)
{
- src.TransposeInto(ref temp);
- if (offsetSourceByNeg128)
- {
- temp.AddInPlace(-128F);
- }
+ DebugGuard.IsTrue(Vector.IsHardwareAccelerated, "Scalar implementation should be called for non-accelerated hardware.");
- FDCT8x8(ref temp, ref dest);
+ // First pass - process rows
+ block.TransposeInplace();
+ FDCT8x4_Vector4(ref block.V0L);
+ FDCT8x4_Vector4(ref block.V0R);
- dest.TransposeInto(ref temp);
+ // Second pass - process columns
+ block.TransposeInplace();
+ FDCT8x4_Vector4(ref block.V0L);
+ FDCT8x4_Vector4(ref block.V0R);
+ }
- FDCT8x8(ref temp, ref dest);
+ ///
+ /// Apply 1D floating point FDCT inplace on 8x4 part of 8x8 matrix.
+ ///
+ ///
+ /// Implemented using Vector4 API operations for either scalar or sse hardware implementation.
+ /// Must be called on both 8x4 matrix parts for the full FDCT transform.
+ ///
+ /// Input reference to the first <see cref="Vector4"/> of the 8x4 part.
+ private static void FDCT8x4_Vector4(ref Vector4 blockRef)
+ {
+ Vector4 tmp0 = Unsafe.Add(ref blockRef, 0) + Unsafe.Add(ref blockRef, 14);
+ Vector4 tmp7 = Unsafe.Add(ref blockRef, 0) - Unsafe.Add(ref blockRef, 14);
+ Vector4 tmp1 = Unsafe.Add(ref blockRef, 2) + Unsafe.Add(ref blockRef, 12);
+ Vector4 tmp6 = Unsafe.Add(ref blockRef, 2) - Unsafe.Add(ref blockRef, 12);
+ Vector4 tmp2 = Unsafe.Add(ref blockRef, 4) + Unsafe.Add(ref blockRef, 10);
+ Vector4 tmp5 = Unsafe.Add(ref blockRef, 4) - Unsafe.Add(ref blockRef, 10);
+ Vector4 tmp3 = Unsafe.Add(ref blockRef, 6) + Unsafe.Add(ref blockRef, 8);
+ Vector4 tmp4 = Unsafe.Add(ref blockRef, 6) - Unsafe.Add(ref blockRef, 8);
+
+ // Even part
+ Vector4 tmp10 = tmp0 + tmp3;
+ Vector4 tmp13 = tmp0 - tmp3;
+ Vector4 tmp11 = tmp1 + tmp2;
+ Vector4 tmp12 = tmp1 - tmp2;
+
+ Unsafe.Add(ref blockRef, 0) = tmp10 + tmp11;
+ Unsafe.Add(ref blockRef, 8) = tmp10 - tmp11;
+
+ Vector4 z1 = (tmp12 + tmp13) * mm128_F_0_7071;
+ Unsafe.Add(ref blockRef, 4) = tmp13 + z1;
+ Unsafe.Add(ref blockRef, 12) = tmp13 - z1;
+
+ // Odd part
+ tmp10 = tmp4 + tmp5;
+ tmp11 = tmp5 + tmp6;
+ tmp12 = tmp6 + tmp7;
+
+ Vector4 z5 = (tmp10 - tmp12) * mm128_F_0_3826;
+ Vector4 z2 = (mm128_F_0_5411 * tmp10) + z5;
+ Vector4 z4 = (mm128_F_1_3065 * tmp12) + z5;
+ Vector4 z3 = tmp11 * mm128_F_0_7071;
+
+ Vector4 z11 = tmp7 + z3;
+ Vector4 z13 = tmp7 - z3;
+
+ Unsafe.Add(ref blockRef, 10) = z13 + z2;
+ Unsafe.Add(ref blockRef, 6) = z13 - z2;
+ Unsafe.Add(ref blockRef, 2) = z11 + z4;
+ Unsafe.Add(ref blockRef, 14) = z11 - z4;
+ }
- dest.MultiplyInPlace(C_0_125);
+ ///
+ /// Apply floating point IDCT inplace.
+ /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239.
+ ///
+ /// Input matrix.
+ /// Matrix to store temporary results.
+ public static void TransformIDCT(ref Block8x8F block, ref Block8x8F temp)
+ {
+ block.TransposeInplace();
+ IDCT8x8(ref block, ref temp);
+ temp.TransposeInplace();
+ IDCT8x8(ref temp, ref block);
+
+ // TODO: This can be fused into quantization table step
+ block.MultiplyInPlace(C_0_125);
}
///
@@ -303,7 +322,7 @@ public static void TransformFDCT(
///
/// Source
/// Destination
- public static void IDCT8x8(ref Block8x8F s, ref Block8x8F d)
+ private static void IDCT8x8(ref Block8x8F s, ref Block8x8F d)
{
#if SUPPORTS_RUNTIME_INTRINSICS
if (Avx.IsSupported)
@@ -432,83 +451,5 @@ public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d)
d.V3R = my3 + mb3;
d.V4R = my3 - mb3;
}
-
- ///
- /// Combined operation of and
- /// using AVX commands.
- ///
- /// Source
- /// Destination
- public static void IDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d)
- {
-#if SUPPORTS_RUNTIME_INTRINSICS
- Debug.Assert(Avx.IsSupported, "AVX is required to execute this method");
-
- Vector256 my1 = s.V1;
- Vector256 my7 = s.V7;
- Vector256 mz0 = Avx.Add(my1, my7);
-
- Vector256 my3 = s.V3;
- Vector256 mz2 = Avx.Add(my3, my7);
- Vector256 my5 = s.V5;
- Vector256 mz1 = Avx.Add(my3, my5);
- Vector256 mz3 = Avx.Add(my1, my5);
-
- Vector256 mz4 = Avx.Multiply(Avx.Add(mz0, mz1), C_V_1_1758);
-
- mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz2, C_V_n1_9615);
- mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz3, C_V_n0_3901);
- mz0 = Avx.Multiply(mz0, C_V_n0_8999);
- mz1 = Avx.Multiply(mz1, C_V_n2_5629);
-
- Vector256 mb3 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my7, C_V_0_2986), mz2);
- Vector256 mb2 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my5, C_V_2_0531), mz3);
- Vector256 mb1 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my3, C_V_3_0727), mz2);
- Vector256 mb0 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my1, C_V_1_5013), mz3);
-
- Vector256 my2 = s.V2;
- Vector256 my6 = s.V6;
- mz4 = Avx.Multiply(Avx.Add(my2, my6), C_V_0_5411);
- Vector256 my0 = s.V0;
- Vector256 my4 = s.V4;
- mz0 = Avx.Add(my0, my4);
- mz1 = Avx.Subtract(my0, my4);
- mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my6, C_V_n1_8477);
- mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my2, C_V_0_7653);
-
- my0 = Avx.Add(mz0, mz3);
- my3 = Avx.Subtract(mz0, mz3);
- my1 = Avx.Add(mz1, mz2);
- my2 = Avx.Subtract(mz1, mz2);
-
- d.V0 = Avx.Add(my0, mb0);
- d.V7 = Avx.Subtract(my0, mb0);
- d.V1 = Avx.Add(my1, mb1);
- d.V6 = Avx.Subtract(my1, mb1);
- d.V2 = Avx.Add(my2, mb2);
- d.V5 = Avx.Subtract(my2, mb2);
- d.V3 = Avx.Add(my3, mb3);
- d.V4 = Avx.Subtract(my3, mb3);
-#endif
- }
-
- ///
- /// Apply floating point IDCT transformation into dest, using a temporary block 'temp' provided by the caller (optimization).
- /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239
- ///
- /// Source
- /// Destination
- /// Temporary block provided by the caller
- public static void TransformIDCT(ref Block8x8F src, ref Block8x8F dest, ref Block8x8F temp)
- {
- src.TransposeInto(ref temp);
-
- IDCT8x8(ref temp, ref dest);
- dest.TransposeInto(ref temp);
- IDCT8x8(ref temp, ref dest);
-
- // TODO: What if we leave the blocks in a scaled-by-x8 state until final color packing?
- dest.MultiplyInPlace(C_0_125);
- }
}
}
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Quantization.cs b/src/ImageSharp/Formats/Jpeg/Components/Quantization.cs
index 2ff56c63b9..eab5e6a082 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Quantization.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Quantization.cs
@@ -39,53 +39,59 @@ internal static class Quantization
public const int QualityEstimationConfidenceUpperThreshold = 98;
///
- /// Gets the unscaled luminance quantization table in zig-zag order. Each
- /// encoder copies and scales the tables according to its quality parameter.
- /// The values are derived from ITU section K.1 after converting from natural to
- /// zig-zag order.
+ /// Gets unscaled luminance quantization table.
///
+ ///
+ /// The values are derived from ITU section K.1.
+ ///
// The C# compiler emits this as a compile-time constant embedded in the PE file.
// This is effectively compiled down to: return new ReadOnlySpan(&data, length)
// More details can be found: https://github.com/dotnet/roslyn/pull/24621
- public static ReadOnlySpan UnscaledQuant_Luminance => new byte[]
+ public static ReadOnlySpan LuminanceTable => new byte[]
{
- 16, 11, 12, 14, 12, 10, 16, 14, 13, 14, 18, 17, 16, 19, 24,
- 40, 26, 24, 22, 22, 24, 49, 35, 37, 29, 40, 58, 51, 61, 60,
- 57, 51, 56, 55, 64, 72, 92, 78, 64, 68, 87, 69, 55, 56, 80,
- 109, 81, 87, 95, 98, 103, 104, 103, 62, 77, 113, 121, 112,
- 100, 120, 92, 101, 103, 99,
+ 16, 11, 10, 16, 24, 40, 51, 61,
+ 12, 12, 14, 19, 26, 58, 60, 55,
+ 14, 13, 16, 24, 40, 57, 69, 56,
+ 14, 17, 22, 29, 51, 87, 80, 62,
+ 18, 22, 37, 56, 68, 109, 103, 77,
+ 24, 35, 55, 64, 81, 104, 113, 92,
+ 49, 64, 78, 87, 103, 121, 120, 101,
+ 72, 92, 95, 98, 112, 100, 103, 99,
};
///
- /// Gets the unscaled chrominance quantization table in zig-zag order. Each
- /// encoder copies and scales the tables according to its quality parameter.
- /// The values are derived from ITU section K.1 after converting from natural to
- /// zig-zag order.
+ /// Gets unscaled chrominance quantization table.
///
+ ///
+ /// The values are derived from ITU section K.1.
+ ///
// The C# compiler emits this as a compile-time constant embedded in the PE file.
// This is effectively compiled down to: return new ReadOnlySpan(&data, length)
// More details can be found: https://github.com/dotnet/roslyn/pull/24621
- public static ReadOnlySpan UnscaledQuant_Chrominance => new byte[]
+ public static ReadOnlySpan ChrominanceTable => new byte[]
{
- 17, 18, 18, 24, 21, 24, 47, 26, 26, 47, 99, 66, 56, 66,
- 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
- 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
- 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
+ 17, 18, 24, 47, 99, 99, 99, 99,
+ 18, 21, 26, 66, 99, 99, 99, 99,
+ 24, 26, 56, 99, 99, 99, 99, 99,
+ 47, 66, 99, 99, 99, 99, 99, 99,
+ 99, 99, 99, 99, 99, 99, 99, 99,
+ 99, 99, 99, 99, 99, 99, 99, 99,
+ 99, 99, 99, 99, 99, 99, 99, 99,
99, 99, 99, 99, 99, 99, 99, 99,
};
/// Ported from JPEGsnoop:
/// https://github.com/ImpulseAdventure/JPEGsnoop/blob/9732ee0961f100eb69bbff4a0c47438d5997abee/source/JfifDecode.cpp#L4570-L4694
///
- /// Estimates jpeg quality based on quantization table in zig-zag order.
+ /// Estimates jpeg quality based on standard quantization table.
///
///
- /// This technically can be used with any given table but internal decoder code uses ITU spec tables:
- /// and .
+ /// Technically, this can be used with any given table but internal decoder code uses ITU spec tables:
+ /// <see cref="LuminanceTable"/> and <see cref="ChrominanceTable"/>.
///
/// Input quantization table.
- /// Quantization to estimate against.
- /// Estimated quality
+ /// Natural order quantization table to estimate against.
+ /// Estimated quality.
public static int EstimateQuality(ref Block8x8F table, ReadOnlySpan target)
{
// This method can be SIMD'ified if standard table is injected as Block8x8F.
@@ -106,11 +112,10 @@ public static int EstimateQuality(ref Block8x8F table, ReadOnlySpan target
int quality;
for (int i = 0; i < Block8x8F.Size; i++)
{
- float coeff = table[i];
- int coeffInteger = (int)coeff;
+ int coeff = (int)table[i];
// Coefficients are actually int16 casted to float numbers so there's no truncating error.
- if (coeffInteger != 0)
+ if (coeff != 0)
{
comparePercent = 100.0 * (table[i] / target[i]);
}
@@ -152,7 +157,7 @@ public static int EstimateQuality(ref Block8x8F table, ReadOnlySpan target
/// Estimated quality
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int EstimateLuminanceQuality(ref Block8x8F luminanceTable)
- => EstimateQuality(ref luminanceTable, UnscaledQuant_Luminance);
+ => EstimateQuality(ref luminanceTable, LuminanceTable);
///
/// Estimates jpeg quality based on quantization table in zig-zag order.
@@ -161,7 +166,7 @@ public static int EstimateLuminanceQuality(ref Block8x8F luminanceTable)
/// Estimated quality
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int EstimateChrominanceQuality(ref Block8x8F chrominanceTable)
- => EstimateQuality(ref chrominanceTable, UnscaledQuant_Chrominance);
+ => EstimateQuality(ref chrominanceTable, ChrominanceTable);
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int QualityToScale(int quality)
@@ -185,10 +190,10 @@ private static Block8x8F ScaleQuantizationTable(int scale, ReadOnlySpan un
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Block8x8F ScaleLuminanceTable(int quality)
- => ScaleQuantizationTable(scale: QualityToScale(quality), UnscaledQuant_Luminance);
+ => ScaleQuantizationTable(scale: QualityToScale(quality), LuminanceTable);
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Block8x8F ScaleChrominanceTable(int quality)
- => ScaleQuantizationTable(scale: QualityToScale(quality), UnscaledQuant_Chrominance);
+ => ScaleQuantizationTable(scale: QualityToScale(quality), ChrominanceTable);
}
}
diff --git a/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
new file mode 100644
index 0000000000..6577739c1a
--- /dev/null
+++ b/src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs
@@ -0,0 +1,300 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace SixLabors.ImageSharp.Formats.Jpeg.Components
+{
+ internal static partial class ZigZag
+ {
+#pragma warning disable SA1309 // naming rules violation warnings
+ ///
+ /// Special byte value to zero out elements during Sse/Avx shuffle intrinsics.
+ ///
+ private const byte _ = 0xff;
+#pragma warning restore SA1309
+
+ ///
+ /// Gets shuffle vectors for <see cref="ApplyZigZagOrderingSsse3"/>
+ /// zig zag implementation.
+ ///
+ private static ReadOnlySpan SseShuffleMasks => new byte[]
+ {
+ // row0
+ 0, 1, 2, 3, _, _, _, _, _, _, 4, 5, 6, 7, _, _,
+ _, _, _, _, 0, 1, _, _, 2, 3, _, _, _, _, 4, 5,
+ _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _,
+
+ // row1
+ _, _, _, _, _, _, _, _, _, _, _, _, 8, 9, 10, 11,
+ 2, 3, _, _, _, _, _, _, 4, 5, _, _, _, _, _, _,
+ _, _, 0, 1, _, _, 2, 3, _, _, _, _, _, _, _, _,
+
+ // row2
+ _, _, _, _, _, _, 2, 3, _, _, _, _, _, _, 4, 5,
+ _, _, _, _, _, _, _, _, 0, 1, _, _, 2, 3, _, _,
+
+ // row3
+ _, _, _, _, _, _, 12, 13, 14, 15, _, _, _, _, _, _,
+ _, _, _, _, 10, 11, _, _, _, _, 12, 13, _, _, _, _,
+ _, _, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _,
+ 6, 7, _, _, _, _, _, _, _, _, _, _, _, _, 8, 9,
+
+ // row4
+ _, _, 4, 5, _, _, _, _, _, _, _, _, 6, 7, _, _,
+ _, _, _, _, 2, 3, _, _, _, _, 4, 5, _, _, _, _,
+ _, _, _, _, _, _, 0, 1, 2, 3, _, _, _, _, _, _,
+
+ // row5
+ _, _, 12, 13, _, _, 14, 15, _, _, _, _, _, _, _, _,
+ 10, 11, _, _, _, _, _, _, 12, 13, _, _, _, _, _, _,
+
+ // row6
+ _, _, _, _, _, _, _, _, 12, 13, _, _, 14, 15, _, _,
+ _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, 12, 13,
+ 4, 5, 6, 7, _, _, _, _, _, _, _, _, _, _, _, _,
+
+ // row7
+ 10, 11, _, _, _, _, 12, 13, _, _, 14, 15, _, _, _, _,
+ _, _, 8, 9, 10, 11, _, _, _, _, _, _, 12, 13, 14, 15
+ };
+
+ ///
+ /// Gets shuffle vectors for
+ /// zig zag implementation.
+ ///
+ private static ReadOnlySpan AvxShuffleMasks => new byte[]
+ {
+ // 01_AB/01_EF/23_CD - cross-lane
+ 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, 6, 0, 0, 0,
+
+ // 01_AB - inner-lane
+ 0, 1, 2, 3, 8, 9, _, _, 10, 11, 4, 5, 6, 7, 12, 13, _, _, _, _, _, _, _, _, _, _, 10, 11, 4, 5, 6, 7,
+
+ // 01_CD/23_GH - cross-lane
+ 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, _, _, _, _, 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 0, _, _, _, _,
+
+ // 01_CD - inner-lane
+ _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, 2, 3, 8, 9, _, _, 10, 11, 4, 5, _, _, _, _, _, _,
+
+ // 01_EF - inner-lane
+ _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, _, _,
+
+ // 23_AB/45_CD/67_EF - cross-lane
+ 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, _, _, _, _, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, _, _, _, _,
+
+ // 23_AB - inner-lane
+ 4, 5, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7, 0, 1, 2, 3, 8, 9, _, _, _, _,
+
+ // 23_CD - inner-lane
+ _, _, 6, 7, 12, 13, _, _, _, _, _, _, _, _, _, _, 10, 11, 4, 5, _, _, _, _, _, _, _, _, 6, 7, 12, 13,
+
+ // 23_EF - inner-lane
+ _, _, _, _, _, _, 2, 3, 8, 9, _, _, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
+
+ // 23_GH - inner-lane
+ _, _, _, _, _, _, _, _, _, _, 0, 1, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
+
+ // 45_AB - inner-lane
+ _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, _, _, _, _,
+
+ // 45_CD - inner-lane
+ _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7, 0, 1, _, _, 2, 3, 8, 9, _, _, _, _, _, _,
+
+ // 45_EF - cross-lane
+ 1, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, _, _, _, _, 2, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0,
+
+ // 45_EF - inner-lane
+ 2, 3, 8, 9, _, _, _, _, _, _, _, _, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, 2, 3, 8, 9, _, _,
+
+ // 45_GH - inner-lane
+ _, _, _, _, 2, 3, 8, 9, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, 6, 7,
+
+ // 67_CD - inner-lane
+ _, _, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,
+
+ // 67_EF - inner-lane
+ _, _, _, _, _, _, 6, 7, 0, 1, _, _, 2, 3, 8, 9, _, _, _, _, _, _, _, _, 10, 11, _, _, _, _, _, _,
+
+ // 67_GH - inner-lane
+ 8, 9, 10, 11, 4, 5, _, _, _, _, _, _, _, _, _, _, 2, 3, 8, 9, 10, 11, 4, 5, _, _, 6, 7, 12, 13, 14, 15
+ };
+
+ ///
+ /// Applies zig zag ordering for given 8x8 matrix using SSE cpu intrinsics.
+ ///
+ /// Input matrix.
+ public static unsafe void ApplyZigZagOrderingSsse3(ref Block8x8 block)
+ {
+ DebugGuard.IsTrue(Ssse3.IsSupported, "Ssse3 support is required to run this operation!");
+
+ fixed (byte* maskPtr = SseShuffleMasks)
+ {
+ Vector128 rowA = block.V0.AsByte();
+ Vector128 rowB = block.V1.AsByte();
+ Vector128