diff --git a/shared-infrastructure b/shared-infrastructure
index 48e73f455f..1f7ee70281 160000
--- a/shared-infrastructure
+++ b/shared-infrastructure
@@ -1 +1 @@
-Subproject commit 48e73f455f15eafefbe3175efc7433e5f277e506
+Subproject commit 1f7ee702812f3a1713ab7f749c0faae0ef139ed7
diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs
index 0581993014..ef457f7ceb 100644
--- a/src/ImageSharp/Common/Helpers/Numerics.cs
+++ b/src/ImageSharp/Common/Helpers/Numerics.cs
@@ -23,6 +23,28 @@ internal static class Numerics
private const int ShuffleAlphaControl = 0b_11_11_11_11;
#endif
+#if !SUPPORTS_BITOPERATIONS
+ /// <summary>
+ /// Gets a lookup table that maps a byte value (0-255) to the number of bits needed to hold it.
+ /// </summary>
+ private static ReadOnlySpan<byte> BitCountLut => new byte[]
+ {
+ 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8,
+ };
+#endif
+
///
/// Determine the Greatest CommonDivisor (GCD) of two numbers.
///
@@ -756,7 +778,7 @@ public static float Lerp(float value1, float value2, float amount)
/// widening them to 32-bit integers and performing four additions.
///
///
- /// byte(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)
+ /// byte(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)
/// is widened and added onto as such:
///
/// accumulator += i32(1, 2, 3, 4);
@@ -825,5 +847,26 @@ public static int EvenReduceSum(Vector256 accumulator)
return Sse2.ConvertToInt32(vsum);
}
#endif
+
+ /// <summary>
+ /// Calculates the minimum number of bits needed to store the given value; the lookup-table fallback only covers values that fit in 16 bits.
+ /// </summary>
+ /// <param name="number">Unsigned integer to store.</param>
+ /// <returns>Minimum number of bits needed to store the given value.</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static int MinimumBitsToStore16(uint number)
+ {
+#if !SUPPORTS_BITOPERATIONS
+ if (number < 0x100)
+ {
+ return BitCountLut[(int)number];
+ }
+
+ return 8 + BitCountLut[(int)number >> 8];
+#else
+ const int bitInUnsignedInteger = sizeof(uint) * 8;
+ return bitInUnsignedInteger - BitOperations.LeadingZeroCount(number);
+#endif
+ }
}
}
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
index 4faf577fd9..b530a37e77 100644
--- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
@@ -532,6 +532,7 @@ private static void Shuffle4Slice3(
///
/// Performs a multiplication and an addition of the .
///
+ /// ret = (vm0 * vm1) + va
/// The vector to add to the intermediate result.
/// The first vector to multiply.
/// The second vector to multiply.
@@ -552,6 +553,30 @@ public static Vector256 MultiplyAdd(
}
}
+ /// <summary>
+ /// Multiplies two vectors element-wise and subtracts a third vector from the product.
+ /// </summary>
+ /// <remarks>ret = (vm0 * vm1) - vs</remarks>
+ /// <param name="vs">The vector to subtract from the intermediate result.</param>
+ /// <param name="vm0">The first vector to multiply.</param>
+ /// <param name="vm1">The second vector to multiply.</param>
+ /// <returns>The resulting vector.</returns>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static Vector256 MultiplySubstract(
+ in Vector256 vs,
+ in Vector256 vm0,
+ in Vector256 vm1)
+ {
+ if (Fma.IsSupported)
+ {
+ return Fma.MultiplySubtract(vm1, vm0, vs);
+ }
+ else
+ {
+ return Avx.Subtract(Avx.Multiply(vm0, vm1), vs);
+ }
+ }
+
///
/// as many elements as possible, slicing them down (keeping the remainder).
///
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
index 2d19f5ce26..8ca7b0c801 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
@@ -18,7 +18,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
///
/// Represents a Jpeg block with coefficients.
///
- [StructLayout(LayoutKind.Sequential)]
+ [StructLayout(LayoutKind.Explicit)]
internal partial struct Block8x8F : IEquatable
{
///
@@ -27,29 +27,69 @@ internal partial struct Block8x8F : IEquatable
public const int Size = 64;
#pragma warning disable SA1600 // ElementsMustBeDocumented
+ [FieldOffset(0)]
public Vector4 V0L;
+ [FieldOffset(16)]
public Vector4 V0R;
+ [FieldOffset(32)]
public Vector4 V1L;
+ [FieldOffset(48)]
public Vector4 V1R;
+ [FieldOffset(64)]
public Vector4 V2L;
+ [FieldOffset(80)]
public Vector4 V2R;
+ [FieldOffset(96)]
public Vector4 V3L;
+ [FieldOffset(112)]
public Vector4 V3R;
+ [FieldOffset(128)]
public Vector4 V4L;
+ [FieldOffset(144)]
public Vector4 V4R;
+ [FieldOffset(160)]
public Vector4 V5L;
+ [FieldOffset(176)]
public Vector4 V5R;
+ [FieldOffset(192)]
public Vector4 V6L;
+ [FieldOffset(208)]
public Vector4 V6R;
+ [FieldOffset(224)]
public Vector4 V7L;
+ [FieldOffset(240)]
public Vector4 V7R;
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+ /// <summary>
+ /// A number of rows of 8 scalar coefficients each in <see cref="Block8x8F"/>
+ /// </summary>
+ public const int RowCount = 8;
+
+ [FieldOffset(0)]
+ public Vector256 V0;
+ [FieldOffset(32)]
+ public Vector256 V1;
+ [FieldOffset(64)]
+ public Vector256 V2;
+ [FieldOffset(96)]
+ public Vector256 V3;
+ [FieldOffset(128)]
+ public Vector256 V4;
+ [FieldOffset(160)]
+ public Vector256 V5;
+ [FieldOffset(192)]
+ public Vector256 V6;
+ [FieldOffset(224)]
+ public Vector256 V7;
+#endif
#pragma warning restore SA1600 // ElementsMustBeDocumented
///
@@ -278,14 +318,14 @@ public void MultiplyInPlace(float value)
if (Avx.IsSupported)
{
var valueVec = Vector256.Create(value);
- Unsafe.As>(ref this.V0L) = Avx.Multiply(Unsafe.As>(ref this.V0L), valueVec);
- Unsafe.As>(ref this.V1L) = Avx.Multiply(Unsafe.As>(ref this.V1L), valueVec);
- Unsafe.As>(ref this.V2L) = Avx.Multiply(Unsafe.As>(ref this.V2L), valueVec);
- Unsafe.As>(ref this.V3L) = Avx.Multiply(Unsafe.As>(ref this.V3L), valueVec);
- Unsafe.As>(ref this.V4L) = Avx.Multiply(Unsafe.As>(ref this.V4L), valueVec);
- Unsafe.As>(ref this.V5L) = Avx.Multiply(Unsafe.As>(ref this.V5L), valueVec);
- Unsafe.As>(ref this.V6L) = Avx.Multiply(Unsafe.As>(ref this.V6L), valueVec);
- Unsafe.As>(ref this.V7L) = Avx.Multiply(Unsafe.As>(ref this.V7L), valueVec);
+ this.V0 = Avx.Multiply(this.V0, valueVec);
+ this.V1 = Avx.Multiply(this.V1, valueVec);
+ this.V2 = Avx.Multiply(this.V2, valueVec);
+ this.V3 = Avx.Multiply(this.V3, valueVec);
+ this.V4 = Avx.Multiply(this.V4, valueVec);
+ this.V5 = Avx.Multiply(this.V5, valueVec);
+ this.V6 = Avx.Multiply(this.V6, valueVec);
+ this.V7 = Avx.Multiply(this.V7, valueVec);
}
else
#endif
@@ -319,45 +359,14 @@ public unsafe void MultiplyInPlace(ref Block8x8F other)
#if SUPPORTS_RUNTIME_INTRINSICS
if (Avx.IsSupported)
{
- Unsafe.As>(ref this.V0L)
- = Avx.Multiply(
- Unsafe.As>(ref this.V0L),
- Unsafe.As>(ref other.V0L));
-
- Unsafe.As>(ref this.V1L)
- = Avx.Multiply(
- Unsafe.As>(ref this.V1L),
- Unsafe.As>(ref other.V1L));
-
- Unsafe.As>(ref this.V2L)
- = Avx.Multiply(
- Unsafe.As>(ref this.V2L),
- Unsafe.As>(ref other.V2L));
-
- Unsafe.As>(ref this.V3L)
- = Avx.Multiply(
- Unsafe.As>(ref this.V3L),
- Unsafe.As>(ref other.V3L));
-
- Unsafe.As>(ref this.V4L)
- = Avx.Multiply(
- Unsafe.As>(ref this.V4L),
- Unsafe.As>(ref other.V4L));
-
- Unsafe.As>(ref this.V5L)
- = Avx.Multiply(
- Unsafe.As>(ref this.V5L),
- Unsafe.As>(ref other.V5L));
-
- Unsafe.As>(ref this.V6L)
- = Avx.Multiply(
- Unsafe.As>(ref this.V6L),
- Unsafe.As>(ref other.V6L));
-
- Unsafe.As>(ref this.V7L)
- = Avx.Multiply(
- Unsafe.As>(ref this.V7L),
- Unsafe.As>(ref other.V7L));
+ this.V0 = Avx.Multiply(this.V0, other.V0);
+ this.V1 = Avx.Multiply(this.V1, other.V1);
+ this.V2 = Avx.Multiply(this.V2, other.V2);
+ this.V3 = Avx.Multiply(this.V3, other.V3);
+ this.V4 = Avx.Multiply(this.V4, other.V4);
+ this.V5 = Avx.Multiply(this.V5, other.V5);
+ this.V6 = Avx.Multiply(this.V6, other.V6);
+ this.V7 = Avx.Multiply(this.V7, other.V7);
}
else
#endif
@@ -392,14 +401,14 @@ public void AddInPlace(float value)
if (Avx.IsSupported)
{
var valueVec = Vector256.Create(value);
- Unsafe.As>(ref this.V0L) = Avx.Add(Unsafe.As>(ref this.V0L), valueVec);
- Unsafe.As>(ref this.V1L) = Avx.Add(Unsafe.As>(ref this.V1L), valueVec);
- Unsafe.As>(ref this.V2L) = Avx.Add(Unsafe.As>(ref this.V2L), valueVec);
- Unsafe.As>(ref this.V3L) = Avx.Add(Unsafe.As>(ref this.V3L), valueVec);
- Unsafe.As>(ref this.V4L) = Avx.Add(Unsafe.As>(ref this.V4L), valueVec);
- Unsafe.As>(ref this.V5L) = Avx.Add(Unsafe.As>(ref this.V5L), valueVec);
- Unsafe.As>(ref this.V6L) = Avx.Add(Unsafe.As>(ref this.V6L), valueVec);
- Unsafe.As>(ref this.V7L) = Avx.Add(Unsafe.As>(ref this.V7L), valueVec);
+ this.V0 = Avx.Add(this.V0, valueVec);
+ this.V1 = Avx.Add(this.V1, valueVec);
+ this.V2 = Avx.Add(this.V2, valueVec);
+ this.V3 = Avx.Add(this.V3, valueVec);
+ this.V4 = Avx.Add(this.V4, valueVec);
+ this.V5 = Avx.Add(this.V5, valueVec);
+ this.V6 = Avx.Add(this.V6, valueVec);
+ this.V7 = Avx.Add(this.V7, valueVec);
}
else
#endif
@@ -468,81 +477,6 @@ public static unsafe void Quantize(
DivideRoundAll(ref dest, ref qt);
}
- ///
- /// Scales the 16x16 region represented by the 4 source blocks to the 8x8 DST block.
- ///
- /// The destination block.
- /// The source block.
- public static unsafe void Scale16X16To8X8(ref Block8x8F destination, ReadOnlySpan source)
- {
-#if SUPPORTS_RUNTIME_INTRINSICS
- if (Avx2.IsSupported)
- {
- Scale16X16To8X8Vectorized(ref destination, source);
- return;
- }
-#endif
-
- Scale16X16To8X8Scalar(ref destination, source);
- }
-
- private static void Scale16X16To8X8Vectorized(ref Block8x8F destination, ReadOnlySpan source)
- {
-#if SUPPORTS_RUNTIME_INTRINSICS
- Debug.Assert(Avx2.IsSupported, "AVX2 is required to execute this method");
-
- var f2 = Vector256.Create(2f);
- var f025 = Vector256.Create(0.25f);
- Vector256 switchInnerDoubleWords = Unsafe.As>(ref MemoryMarshal.GetReference(SimdUtils.HwIntrinsics.PermuteMaskSwitchInnerDWords8x32));
- ref Vector256 destRef = ref Unsafe.As>(ref destination);
-
- for (int i = 0; i < 2; i++)
- {
- ref Vector256 in1 = ref Unsafe.As>(ref Unsafe.Add(ref MemoryMarshal.GetReference(source), 2 * i));
- ref Vector256 in2 = ref Unsafe.As>(ref Unsafe.Add(ref MemoryMarshal.GetReference(source), (2 * i) + 1));
-
- for (int j = 0; j < 8; j += 2)
- {
- Vector256 a = Unsafe.Add(ref in1, j);
- Vector256 b = Unsafe.Add(ref in1, j + 1);
- Vector256 c = Unsafe.Add(ref in2, j);
- Vector256 d = Unsafe.Add(ref in2, j + 1);
-
- Vector256 calc1 = Avx.Shuffle(a, c, 0b10_00_10_00);
- Vector256 calc2 = Avx.Shuffle(a, c, 0b11_01_11_01);
- Vector256 calc3 = Avx.Shuffle(b, d, 0b10_00_10_00);
- Vector256 calc4 = Avx.Shuffle(b, d, 0b11_01_11_01);
-
- Vector256 sum = Avx.Add(Avx.Add(calc1, calc2), Avx.Add(calc3, calc4));
- Vector256 add = Avx.Add(sum, f2);
- Vector256 res = Avx.Multiply(add, f025);
-
- destRef = Avx2.PermuteVar8x32(res, switchInnerDoubleWords);
- destRef = ref Unsafe.Add(ref destRef, 1);
- }
- }
-#endif
- }
-
- private static unsafe void Scale16X16To8X8Scalar(ref Block8x8F destination, ReadOnlySpan source)
- {
- for (int i = 0; i < 4; i++)
- {
- int dstOff = ((i & 2) << 4) | ((i & 1) << 2);
- Block8x8F iSource = source[i];
-
- for (int y = 0; y < 4; y++)
- {
- for (int x = 0; x < 4; x++)
- {
- int j = (16 * y) + (2 * x);
- float sum = iSource[j] + iSource[j + 1] + iSource[j + 8] + iSource[j + 9];
- destination[(8 * y) + x + dstOff] = (sum + 2) * .25F;
- }
- }
- }
- }
-
[MethodImpl(InliningOptions.ShortMethod)]
private static void DivideRoundAll(ref Block8x8F a, ref Block8x8F b)
{
@@ -553,19 +487,13 @@ private static void DivideRoundAll(ref Block8x8F a, ref Block8x8F b)
var vadd = Vector256.Create(.5F);
var vone = Vector256.Create(1f);
- ref Vector256 aBase = ref Unsafe.AsRef(Unsafe.As>(ref a.V0L));
- ref Vector256 bBase = ref Unsafe.AsRef(Unsafe.As>(ref b.V0L));
- ref Vector256 aEnd = ref Unsafe.Add(ref aBase, 8);
-
- do
+ for (int i = 0; i < RowCount; i++)
{
- Vector256 voff = Avx.Multiply(Avx.Min(Avx.Max(vnegOne, aBase), vone), vadd);
- Unsafe.Add(ref aBase, 0) = Avx.Add(Avx.Divide(aBase, bBase), voff);
-
- aBase = ref Unsafe.Add(ref aBase, 1);
- bBase = ref Unsafe.Add(ref bBase, 1);
+ ref Vector256 aRow = ref Unsafe.Add(ref a.V0, i);
+ ref Vector256 bRow = ref Unsafe.Add(ref b.V0, i);
+ Vector256 voff = Avx.Multiply(Avx.Min(Avx.Max(vnegOne, aRow), vone), vadd);
+ aRow = Avx.Add(Avx.Divide(aRow, bRow), voff);
}
- while (Unsafe.IsAddressLessThan(ref aBase, ref aEnd));
}
else
#endif
@@ -805,26 +733,26 @@ public void TransposeInto(ref Block8x8F d)
Vector256 t0 = Avx.UnpackLow(r0, r1);
Vector256 t2 = Avx.UnpackLow(r2, r3);
Vector256 v = Avx.Shuffle(t0, t2, 0x4E);
- Unsafe.As>(ref d.V0L) = Avx.Blend(t0, v, 0xCC);
- Unsafe.As>(ref d.V1L) = Avx.Blend(t2, v, 0x33);
+ d.V0 = Avx.Blend(t0, v, 0xCC);
+ d.V1 = Avx.Blend(t2, v, 0x33);
Vector256 t4 = Avx.UnpackLow(r4, r5);
Vector256 t6 = Avx.UnpackLow(r6, r7);
v = Avx.Shuffle(t4, t6, 0x4E);
- Unsafe.As>(ref d.V4L) = Avx.Blend(t4, v, 0xCC);
- Unsafe.As>(ref d.V5L) = Avx.Blend(t6, v, 0x33);
+ d.V4 = Avx.Blend(t4, v, 0xCC);
+ d.V5 = Avx.Blend(t6, v, 0x33);
Vector256 t1 = Avx.UnpackHigh(r0, r1);
Vector256 t3 = Avx.UnpackHigh(r2, r3);
v = Avx.Shuffle(t1, t3, 0x4E);
- Unsafe.As>(ref d.V2L) = Avx.Blend(t1, v, 0xCC);
- Unsafe.As>(ref d.V3L) = Avx.Blend(t3, v, 0x33);
+ d.V2 = Avx.Blend(t1, v, 0xCC);
+ d.V3 = Avx.Blend(t3, v, 0x33);
Vector256 t5 = Avx.UnpackHigh(r4, r5);
Vector256 t7 = Avx.UnpackHigh(r6, r7);
v = Avx.Shuffle(t5, t7, 0x4E);
- Unsafe.As>(ref d.V6L) = Avx.Blend(t5, v, 0xCC);
- Unsafe.As>(ref d.V7L) = Avx.Blend(t7, v, 0x33);
+ d.V6 = Avx.Blend(t5, v, 0xCC);
+ d.V7 = Avx.Blend(t7, v, 0x33);
}
else
#endif
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs
index bc2c7634b5..bc6c8c6cc7 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs
@@ -1,4 +1,4 @@
-// Copyright (c) Six Labors.
+// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.
namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
@@ -44,7 +44,7 @@ public HuffmanLut(HuffmanSpec spec)
}
}
- this.Values = new uint[maxValue + 1];
+ this.Values = new int[maxValue + 1];
int code = 0;
int k = 0;
@@ -54,7 +54,7 @@ public HuffmanLut(HuffmanSpec spec)
int bits = (i + 1) << 24;
for (int j = 0; j < spec.Count[i]; j++)
{
- this.Values[spec.Values[k]] = (uint)(bits | code);
+ this.Values[spec.Values[k]] = bits | code;
code++;
k++;
}
@@ -66,6 +66,6 @@ public HuffmanLut(HuffmanSpec spec)
///
/// Gets the collection of huffman values.
///
- public uint[] Values { get; }
+ public int[] Values { get; }
}
-}
\ No newline at end of file
+}
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
new file mode 100644
index 0000000000..ca352397b8
--- /dev/null
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -0,0 +1,392 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System.IO;
+using System.Runtime.CompilerServices;
+using System.Threading;
+using SixLabors.ImageSharp.Memory;
+using SixLabors.ImageSharp.PixelFormats;
+
+namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
+{
+ internal class HuffmanScanEncoder
+ {
+ /// <summary>
+ /// Number of bytes cached before being written to target stream via Stream.Write(byte[], offset, count).
+ /// </summary>
+ /// <remarks>
+ /// This is subject to change, 1024 seems to be the best value in terms of performance.
+ /// <see cref="Emit"/> expects it to be at least 8 (see comments in method body).
+ /// </remarks>
+ private const int EmitBufferSizeInBytes = 1024;
+
+ ///
+ /// A buffer for reducing the number of stream writes when emitting Huffman tables.
+ ///
+ private readonly byte[] emitBuffer = new byte[EmitBufferSizeInBytes];
+
+ ///
+ /// Number of filled bytes in buffer
+ ///
+ private int emitLen = 0;
+
+ /// <summary>
+ /// Emitted bits 'micro buffer' before being transferred to the <see cref="emitBuffer"/>.
+ /// </summary>
+ private int accumulatedBits;
+
+ /// <summary>
+ /// Number of jagged bits stored in <see cref="accumulatedBits"/>
+ /// </summary>
+ private int bitCount;
+
+ private Block8x8F temporalBlock1;
+ private Block8x8F temporalBlock2;
+
+ /// <summary>
+ /// The output stream that receives the encoded scan data.
+ /// </summary>
+ private readonly Stream target;
+
+ public HuffmanScanEncoder(Stream outputStream)
+ {
+ this.target = outputStream;
+ }
+
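+ /// <summary>
+ /// Encodes the image with no subsampling.
+ /// </summary>
+ /// <typeparam name="TPixel">The pixel format.</typeparam>
+ /// <param name="pixels">The image whose root frame is encoded.</param>
+ /// <param name="luminanceQuantTable">Luminance quantization table provided by the caller</param>
+ /// <param name="chrominanceQuantTable">Chrominance quantization table provided by the caller</param>
+ /// <param name="cancellationToken">The token to monitor for cancellation.</param>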
+ public void Encode444(Image pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken)
+ where TPixel : unmanaged, IPixel
+ {
+ var unzig = ZigZag.CreateUnzigTable();
+
+ // ReSharper disable once InconsistentNaming
+ int prevDCY = 0, prevDCCb = 0, prevDCCr = 0;
+
+ ImageFrame frame = pixels.Frames.RootFrame;
+ Buffer2D pixelBuffer = frame.PixelBuffer;
+ RowOctet currentRows = default;
+
+ var pixelConverter = new YCbCrForwardConverter444(frame);
+
+ for (int y = 0; y < pixels.Height; y += 8)
+ {
+ cancellationToken.ThrowIfCancellationRequested();
+ currentRows.Update(pixelBuffer, y);
+
+ for (int x = 0; x < pixels.Width; x += 8)
+ {
+ pixelConverter.Convert(x, y, ref currentRows);
+
+ prevDCY = this.WriteBlock(
+ QuantIndex.Luminance,
+ prevDCY,
+ ref pixelConverter.Y,
+ ref luminanceQuantTable,
+ ref unzig);
+
+ prevDCCb = this.WriteBlock(
+ QuantIndex.Chrominance,
+ prevDCCb,
+ ref pixelConverter.Cb,
+ ref chrominanceQuantTable,
+ ref unzig);
+
+ prevDCCr = this.WriteBlock(
+ QuantIndex.Chrominance,
+ prevDCCr,
+ ref pixelConverter.Cr,
+ ref chrominanceQuantTable,
+ ref unzig);
+ }
+ }
+
+ this.FlushInternalBuffer();
+ }
+
+ /// <summary>
+ /// Encodes the image with subsampling. The Cb and Cr components are each subsampled
+ /// at a factor of 2 both horizontally and vertically.
+ /// </summary>
+ /// <typeparam name="TPixel">The pixel format.</typeparam>
+ /// <param name="pixels">The image whose root frame is encoded.</param>
+ /// <param name="luminanceQuantTable">Luminance quantization table provided by the caller</param>
+ /// <param name="chrominanceQuantTable">Chrominance quantization table provided by the caller</param>
+ /// <param name="cancellationToken">The token to monitor for cancellation.</param>
+ public void Encode420(Image pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken)
+ where TPixel : unmanaged, IPixel
+ {
+ var unzig = ZigZag.CreateUnzigTable();
+
+ // ReSharper disable once InconsistentNaming
+ int prevDCY = 0, prevDCCb = 0, prevDCCr = 0;
+ ImageFrame frame = pixels.Frames.RootFrame;
+ Buffer2D pixelBuffer = frame.PixelBuffer;
+ RowOctet currentRows = default;
+
+ var pixelConverter = new YCbCrForwardConverter420(frame);
+
+ for (int y = 0; y < pixels.Height; y += 16)
+ {
+ cancellationToken.ThrowIfCancellationRequested();
+ for (int x = 0; x < pixels.Width; x += 16)
+ {
+ for (int i = 0; i < 2; i++)
+ {
+ int yOff = i * 8;
+ currentRows.Update(pixelBuffer, y + yOff);
+ pixelConverter.Convert(x, y, ref currentRows, i);
+
+ prevDCY = this.WriteBlock(
+ QuantIndex.Luminance,
+ prevDCY,
+ ref pixelConverter.YLeft,
+ ref luminanceQuantTable,
+ ref unzig);
+
+ prevDCY = this.WriteBlock(
+ QuantIndex.Luminance,
+ prevDCY,
+ ref pixelConverter.YRight,
+ ref luminanceQuantTable,
+ ref unzig);
+ }
+
+ prevDCCb = this.WriteBlock(
+ QuantIndex.Chrominance,
+ prevDCCb,
+ ref pixelConverter.Cb,
+ ref chrominanceQuantTable,
+ ref unzig);
+
+ prevDCCr = this.WriteBlock(
+ QuantIndex.Chrominance,
+ prevDCCr,
+ ref pixelConverter.Cr,
+ ref chrominanceQuantTable,
+ ref unzig);
+ }
+ }
+
+ this.FlushInternalBuffer();
+ }
+
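+ /// <summary>
+ /// Encodes the image with no chroma, just luminance.
+ /// </summary>
+ /// <typeparam name="TPixel">The pixel format.</typeparam>
+ /// <param name="pixels">The image whose root frame is encoded.</param>
+ /// <param name="luminanceQuantTable">Luminance quantization table provided by the caller</param>
+ /// <param name="cancellationToken">The token to monitor for cancellation.</param>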
+ public void EncodeGrayscale(Image pixels, ref Block8x8F luminanceQuantTable, CancellationToken cancellationToken)
+ where TPixel : unmanaged, IPixel
+ {
+ var unzig = ZigZag.CreateUnzigTable();
+
+ // ReSharper disable once InconsistentNaming
+ int prevDCY = 0;
+
+ var pixelConverter = LuminanceForwardConverter.Create();
+ ImageFrame frame = pixels.Frames.RootFrame;
+ Buffer2D pixelBuffer = frame.PixelBuffer;
+ RowOctet currentRows = default;
+
+ for (int y = 0; y < pixels.Height; y += 8)
+ {
+ cancellationToken.ThrowIfCancellationRequested();
+ currentRows.Update(pixelBuffer, y);
+
+ for (int x = 0; x < pixels.Width; x += 8)
+ {
+ pixelConverter.Convert(frame, x, y, ref currentRows);
+
+ prevDCY = this.WriteBlock(
+ QuantIndex.Luminance,
+ prevDCY,
+ ref pixelConverter.Y,
+ ref luminanceQuantTable,
+ ref unzig);
+ }
+ }
+
+ this.FlushInternalBuffer();
+ }
+
+ /// <summary>
+ /// Writes a block of pixel data using the given quantization table,
+ /// returning the post-quantized DC value of the DCT-transformed block.
+ /// The block is in natural (not zig-zag) order.
+ /// </summary>
+ /// <param name="index">The quantization table index.</param>
+ /// <param name="prevDC">The previous DC value.</param>
+ /// <param name="src">Source block</param>
+ /// <param name="quant">Quantization table</param>
+ /// <param name="unZig">The 8x8 Unzig block.</param>
+ /// <returns>The quantized DC value of this block; callers pass it back as the previous DC value of the next block.</returns>
+ private int WriteBlock(
+ QuantIndex index,
+ int prevDC,
+ ref Block8x8F src,
+ ref Block8x8F quant,
+ ref ZigZag unZig)
+ {
+ ref Block8x8F refTemp1 = ref this.temporalBlock1;
+ ref Block8x8F refTemp2 = ref this.temporalBlock2;
+
+ FastFloatingPointDCT.TransformFDCT(ref src, ref refTemp1, ref refTemp2);
+
+ Block8x8F.Quantize(ref refTemp1, ref refTemp2, ref quant, ref unZig);
+
+ int dc = (int)refTemp2[0];
+
+ // Emit the DC value as a delta to the previous DC value (run length is always 0 here).
+ this.EmitHuffRLE((2 * (int)index) + 0, 0, dc - prevDC);
+
+ // Emit the AC components as (zero run, value) pairs: 0xf0 covers 16 zeros, a trailing zero run ends with 0x00.
+ int h = (2 * (int)index) + 1;
+ int runLength = 0;
+
+ for (int zig = 1; zig < Block8x8F.Size; zig++)
+ {
+ int ac = (int)refTemp2[zig];
+
+ if (ac == 0)
+ {
+ runLength++;
+ }
+ else
+ {
+ while (runLength > 15)
+ {
+ this.EmitHuff(h, 0xf0);
+ runLength -= 16;
+ }
+
+ this.EmitHuffRLE(h, runLength, ac);
+ runLength = 0;
+ }
+ }
+
+ if (runLength > 0)
+ {
+ this.EmitHuff(h, 0x00);
+ }
+
+ return dc;
+ }
+
+ /// <summary>
+ /// Emits the least significant <paramref name="count"/> bits of <paramref name="bits"/> to the stream write buffer.
+ /// The precondition is
+ /// <example>
+ /// bits &lt; (1 &lt;&lt; count) &amp;&amp; count &lt;= 16
+ /// </example>
+ /// so that the pending bits plus the new ones fit into the 32-bit accumulator.
+ /// </summary>
+ /// <param name="bits">The packed bits.</param>
+ /// <param name="count">The number of bits.</param>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ private void Emit(int bits, int count)
+ {
+ count += this.bitCount;
+ bits <<= 32 - count;
+ bits |= this.accumulatedBits;
+
+ // Only write once at least 8 bits are pending.
+ if (count >= 8)
+ {
+ // Move complete bytes from the top of the accumulator into the buffer; a 0xFF byte is followed by a 0x00 byte.
+ while (count >= 8)
+ {
+ byte b = (byte)(bits >> 24);
+ this.emitBuffer[this.emitLen++] = b;
+ if (b == byte.MaxValue)
+ {
+ this.emitBuffer[this.emitLen++] = byte.MinValue;
+ }
+
+ bits <<= 8;
+ count -= 8;
+ }
+
+ // This can emit 4 times of:
+ // 1 byte guaranteed
+ // 1 extra byte.MinValue byte if previous one was byte.MaxValue
+ // Thus writing (1 + 1) * 4 = 8 bytes max
+ // So we must check if emit buffer has extra 8 bytes, if not - call stream.Write
+ if (this.emitLen > EmitBufferSizeInBytes - 8)
+ {
+ this.target.Write(this.emitBuffer, 0, this.emitLen);
+ this.emitLen = 0;
+ }
+ }
+
+ this.accumulatedBits = bits;
+ this.bitCount = count;
+ }
+
+ /// <summary>
+ /// Emits the Huffman code of the given value; the table entry holds the code length in its top 8 bits and the code in its low 24 bits.
+ /// </summary>
+ /// <param name="index">The index of the Huffman encoder</param>
+ /// <param name="value">The value to encode.</param>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ private void EmitHuff(int index, int value)
+ {
+ int x = HuffmanLut.TheHuffmanLut[index].Values[value];
+ this.Emit(x & ((1 << 24) - 1), x >> 24);
+ }
+
+ /// <summary>
+ /// Emits the Huffman code for the (run length, bit size) symbol of the value, followed by the raw bits of the value itself.
+ /// </summary>
+ /// <param name="index">The index of the Huffman encoder</param>
+ /// <param name="runLength">The run length stored in the upper four bits of the emitted symbol.</param>
+ /// <param name="value">The value to encode.</param>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ private void EmitHuffRLE(int index, int runLength, int value)
+ {
+ int a = value;
+ int b = value;
+ if (a < 0)
+ {
+ a = -value;
+ b = value - 1;
+ }
+
+ int bt = Numerics.MinimumBitsToStore16((uint)a);
+
+ this.EmitHuff(index, (runLength << 4) | bt);
+ if (bt > 0)
+ {
+ this.Emit(b & ((1 << bt) - 1), bt);
+ }
+ }
+
+ /// <summary>
+ /// Pads the pending bits to a byte boundary with 1's when needed and writes the buffered bytes to the target stream.
+ /// </summary>
+ private void FlushInternalBuffer()
+ {
+ // Emit leaves 0..7 pending bits; the outer modulo turns an already aligned state into zero padding.
+ int padBitsCount = (8 - (this.bitCount % 8)) % 8;
+ if (padBitsCount != 0)
+ {
+ this.Emit((1 << padBitsCount) - 1, padBitsCount);
+ }
+
+ // Write whatever is still buffered, then mark the buffer empty so a repeated flush cannot resend it.
+ if (this.emitLen != 0)
+ {
+ this.target.Write(this.emitBuffer, 0, this.emitLen);
+ this.emitLen = 0;
+ }
+ }
+ }
+}
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/LuminanceForwardConverter{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/LuminanceForwardConverter{TPixel}.cs
index cc81130dd7..fc5b9a8682 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/LuminanceForwardConverter{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/LuminanceForwardConverter{TPixel}.cs
@@ -49,7 +49,7 @@ public void Convert(ImageFrame frame, int x, int y, ref RowOctet
ref Block8x8F yBlock = ref this.Y;
ref L8 l8Start = ref l8Span[0];
- for (int i = 0; i < 64; i++)
+ for (int i = 0; i < Block8x8F.Size; i++)
{
ref L8 c = ref Unsafe.Add(ref l8Start, i);
yBlock[i] = c.PackedValue;
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
index 3c1a02c5aa..15574a32a2 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
@@ -92,48 +92,144 @@ public static RgbToYCbCrConverterLut Create()
return tables;
}
- ///
- /// Optimized method to allocates the correct y, cb, and cr values to the DCT blocks from the given r, g, b values.
- ///
[MethodImpl(MethodImplOptions.AggressiveInlining)]
- private void ConvertPixelInto(
- int r,
- int g,
- int b,
- ref Block8x8F yResult,
- ref Block8x8F cbResult,
- ref Block8x8F crResult,
- int i)
+ private float CalculateY(byte r, byte g, byte b)
{
// float y = (0.299F * r) + (0.587F * g) + (0.114F * b);
- yResult[i] = (this.YRTable[r] + this.YGTable[g] + this.YBTable[b]) >> ScaleBits;
+ return (this.YRTable[r] + this.YGTable[g] + this.YBTable[b]) >> ScaleBits;
+ }
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private float CalculateCb(byte r, byte g, byte b)
+ {
// float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b));
- cbResult[i] = (this.CbRTable[r] + this.CbGTable[g] + this.CbBTable[b]) >> ScaleBits;
+ return (this.CbRTable[r] + this.CbGTable[g] + this.CbBTable[b]) >> ScaleBits;
+ }
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private float CalculateCr(byte r, byte g, byte b)
+ {
// float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b));
- crResult[i] = (this.CbBTable[r] + this.CrGTable[g] + this.CrBTable[b]) >> ScaleBits;
+ return (this.CbBTable[r] + this.CrGTable[g] + this.CrBTable[b]) >> ScaleBits;
}
- public void Convert(Span rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
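+ /// <summary>
+ /// Converts Rgb24 pixels into YCbCr color space with 4:4:4 sampling of luminance and chroma (no subsampling).
+ /// </summary>
+ /// <param name="rgbSpan">Span of Rgb24 pixel data</param>
+ /// <param name="yBlock">Resulting Y values block</param>
+ /// <param name="cbBlock">Resulting Cb values block</param>
+ /// <param name="crBlock">Resulting Cr values block</param>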
+ public void Convert444(Span rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
{
ref Rgb24 rgbStart = ref rgbSpan[0];
- for (int i = 0; i < 64; i++)
+ for (int i = 0; i < Block8x8F.Size; i++)
{
- ref Rgb24 c = ref Unsafe.Add(ref rgbStart, i);
-
- this.ConvertPixelInto(
- c.R,
- c.G,
- c.B,
- ref yBlock,
- ref cbBlock,
- ref crBlock,
- i);
+ Rgb24 c = Unsafe.Add(ref rgbStart, i);
+
+ yBlock[i] = this.CalculateY(c.R, c.G, c.B);
+ cbBlock[i] = this.CalculateCb(c.R, c.G, c.B);
+ crBlock[i] = this.CalculateCr(c.R, c.G, c.B);
}
}
+ /// <summary>
+ /// Converts Rgb24 pixels into YCbCr color space with 4:2:0 chroma subsampling.
+ /// </summary>
+ /// <remarks>Calculates 2 out of 4 luminance blocks and half of each chroma block. This method must be called twice per group of four 8x8 luminance blocks, once for each row value.</remarks>
+ /// <param name="rgbSpan">Span of Rgb24 pixel data</param>
+ /// <param name="yBlockLeft">First or "left" resulting Y block</param>
+ /// <param name="yBlockRight">Second or "right" resulting Y block</param>
+ /// <param name="cbBlock">Resulting Cb values block</param>
+ /// <param name="crBlock">Resulting Cr values block</param>
+ /// <param name="row">Row index of the 16x16 block, 0 or 1</param>
+ public void Convert420(Span rgbSpan, ref Block8x8F yBlockLeft, ref Block8x8F yBlockRight, ref Block8x8F cbBlock, ref Block8x8F crBlock, int row)
+ {
+ DebugGuard.MustBeBetweenOrEqualTo(row, 0, 1, nameof(row));
+
+ ref float yBlockLeftRef = ref Unsafe.As(ref yBlockLeft);
+ ref float yBlockRightRef = ref Unsafe.As(ref yBlockRight);
+
+ // Chroma values land in elements 0-31 (upper half) for row 0
+ // or in elements 32-63 (lower half) for row 1.
+ int chromaWriteOffset = row * (Block8x8F.Size / 2);
+ ref float cbBlockRef = ref Unsafe.Add(ref Unsafe.As(ref cbBlock), chromaWriteOffset);
+ ref float crBlockRef = ref Unsafe.Add(ref Unsafe.As(ref crBlock), chromaWriteOffset);
+
+ ref Rgb24 rgbStart = ref rgbSpan[0];
+
+ for (int i = 0; i < 8; i += 2)
+ {
+ int yBlockWriteOffset = i * 8;
+ ref Rgb24 stride = ref Unsafe.Add(ref rgbStart, i * 16);
+
+ int chromaOffset = 8 * (i / 2);
+
+ // left 8 pixel columns: fills the left Y block and chroma columns 0-3
+ this.ConvertChunk420(
+ ref stride,
+ ref Unsafe.Add(ref yBlockLeftRef, yBlockWriteOffset),
+ ref Unsafe.Add(ref cbBlockRef, chromaOffset),
+ ref Unsafe.Add(ref crBlockRef, chromaOffset));
+
+ // right 8 pixel columns: fills the right Y block and chroma columns 4-7
+ this.ConvertChunk420(
+ ref Unsafe.Add(ref stride, 8),
+ ref Unsafe.Add(ref yBlockRightRef, yBlockWriteOffset),
+ ref Unsafe.Add(ref cbBlockRef, chromaOffset + 4),
+ ref Unsafe.Add(ref crBlockRef, chromaOffset + 4));
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private void ConvertChunk420(ref Rgb24 stride, ref float yBlock, ref float cbBlock, ref float crBlock)
+ {
+ // Converts an 8 pixel wide, 2 pixel high chunk: writes 16 luminance values and 4 averaged Cb/Cr values.
+ // Source rows are 16 pixels wide, hence the +16 offset to reach the pixel directly below.
+ // Luminance (Y) keeps the original resolution, hence the +8 offset to the next row of the 8x8 Y block.
+ for (int k = 0; k < 8; k += 2)
+ {
+ ref float yBlockRef = ref Unsafe.Add(ref yBlock, k);
+
+ // top row
+ Rgb24 px0 = Unsafe.Add(ref stride, k);
+ Rgb24 px1 = Unsafe.Add(ref stride, k + 1);
+ yBlockRef = this.CalculateY(px0.R, px0.G, px0.B);
+ Unsafe.Add(ref yBlockRef, 1) = this.CalculateY(px1.R, px1.G, px1.B);
+
+ // bottom row
+ Rgb24 px2 = Unsafe.Add(ref stride, k + 16);
+ Rgb24 px3 = Unsafe.Add(ref stride, k + 17);
+ Unsafe.Add(ref yBlockRef, 8) = this.CalculateY(px2.R, px2.G, px2.B);
+ Unsafe.Add(ref yBlockRef, 9) = this.CalculateY(px3.R, px3.G, px3.B);
+
+ // one chroma sample per 2x2 pixel block: the average of the four pixels
+ Unsafe.Add(ref cbBlock, k / 2) = this.CalculateAverageCb(px0, px1, px2, px3);
+ Unsafe.Add(ref crBlock, k / 2) = this.CalculateAverageCr(px0, px1, px2, px3);
+ }
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private float CalculateAverageCb(Rgb24 px0, Rgb24 px1, Rgb24 px2, Rgb24 px3)
+ {
+ return 0.25f
+ * (this.CalculateCb(px0.R, px0.G, px0.B)
+ + this.CalculateCb(px1.R, px1.G, px1.B)
+ + this.CalculateCb(px2.R, px2.G, px2.B)
+ + this.CalculateCb(px3.R, px3.G, px3.B));
+ }
+
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ private float CalculateAverageCr(Rgb24 px0, Rgb24 px1, Rgb24 px2, Rgb24 px3)
+ {
+ return 0.25f
+ * (this.CalculateCr(px0.R, px0.G, px0.B)
+ + this.CalculateCr(px1.R, px1.G, px1.B)
+ + this.CalculateCr(px2.R, px2.G, px2.B)
+ + this.CalculateCr(px3.R, px3.G, px3.B));
+ }
+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int Fix(float x)
=> (int)((x * (1L << ScaleBits)) + 0.5F);
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
index 209cc3c6ab..926e7d5a4a 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
@@ -1,4 +1,4 @@
-// Copyright (c) Six Labors.
+// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.
using System;
@@ -27,19 +27,45 @@ public static bool IsSupported
}
}
+        public static int AvxCompatibilityPadding
+        {
+            // An rgb byte matrix holds 8 strides of 8 pixels each (64 pixels),
+            // stored back to back as one span of 64 * 3 = 192 bytes.
+            // One stride is 3 * 8 = 24 bytes (192 bits) but an Avx register is 256 bits wide,
+            // so every load also pulls in 64 bits that belong to the following stride:
+            // stride 0:    0 -  192, load ends at  256
+            // stride 1:  192 -  384, load ends at  448
+            // stride 2:  384 -  576, load ends at  640
+            // stride 3:  576 -  768, load ends at  832
+            // stride 4:  768 -  960, load ends at 1024
+            // stride 5:  960 - 1152, load ends at 1216
+            // stride 6: 1152 - 1344, load ends at 1408
+            // stride 7: 1344 - 1536, load ends at 1600 <-- READ ACCESS VIOLATION
+            //
+            // The 64 pixel span is only 64 * 3 * 8 = 1536 bits long while the loads need 1600 bits,
+            // which means the last load would read foreign memory.
+            //
+            // Appending 8 padding bytes to the rgb span avoids this without extra code in the converters.
+            get
+            {
+#if SUPPORTS_RUNTIME_INTRINSICS
+                return IsSupported ? 8 : 0;
+#else
+                return 0;
+#endif
+            }
+        }
+
#if SUPPORTS_RUNTIME_INTRINSICS
+
private static ReadOnlySpan MoveFirst24BytesToSeparateLanes => new byte[]
{
0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0,
3, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0
};
- private static ReadOnlySpan MoveLast24BytesToSeparateLanes => new byte[]
- {
- 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0,
- 5, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, 1, 0, 0, 0
- };
-
private static ReadOnlySpan ExtractRgb => new byte[]
{
0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 0xFF, 0xFF, 0xFF, 0xFF,
@@ -47,7 +73,15 @@ public static bool IsSupported
};
#endif
- public static void Convert(ReadOnlySpan rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
+ ///
+ /// Converts 8x8 Rgb24 pixel matrix to YCbCr pixel matrices with 4:4:4 subsampling
+ ///
+ /// Total size of rgb span must be 200 bytes: 192 bytes of pixel data (64 pixels * 3) followed by the 8 byte AvxCompatibilityPadding
+ /// Span of rgb pixels with size of 64
+ /// 8x8 destination matrix of Luminance(Y) converted data
+ /// 8x8 destination matrix of Chrominance(Cb) converted data
+ /// 8x8 destination matrix of Chrominance(Cr) converted data
+ public static void Convert444(ReadOnlySpan rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
{
Debug.Assert(IsSupported, "AVX2 is required to run this converter");
@@ -63,18 +97,20 @@ public static void Convert(ReadOnlySpan rgbSpan, ref Block8x8F yBlock, re
var f05 = Vector256.Create(0.5f);
var zero = Vector256.Create(0).AsByte();
- ref Vector256 inRef = ref Unsafe.As>(ref MemoryMarshal.GetReference(rgbSpan));
- ref Vector256 destYRef = ref Unsafe.As>(ref yBlock);
- ref Vector256 destCbRef = ref Unsafe.As>(ref cbBlock);
- ref Vector256 destCrRef = ref Unsafe.As>(ref crBlock);
+ ref Vector256 rgbByteSpan = ref Unsafe.As>(ref MemoryMarshal.GetReference(rgbSpan));
+ ref Vector256 destYRef = ref yBlock.V0;
+ ref Vector256 destCbRef = ref cbBlock.V0;
+ ref Vector256 destCrRef = ref crBlock.V0;
var extractToLanesMask = Unsafe.As>(ref MemoryMarshal.GetReference(MoveFirst24BytesToSeparateLanes));
var extractRgbMask = Unsafe.As>(ref MemoryMarshal.GetReference(ExtractRgb));
Vector256 rgb, rg, bx;
Vector256 r, g, b;
- for (int i = 0; i < 7; i++)
+
+ const int bytesPerRgbStride = 24;
+ for (int i = 0; i < 8; i++)
{
- rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref inRef, (IntPtr)(24 * i)).AsUInt32(), extractToLanesMask).AsByte();
+ rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref rgbByteSpan, (IntPtr)(bytesPerRgbStride * i)).AsUInt32(), extractToLanesMask).AsByte();
rgb = Avx2.Shuffle(rgb, extractRgbMask);
@@ -94,27 +130,130 @@ public static void Convert(ReadOnlySpan rgbSpan, ref Block8x8F yBlock, re
// 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
Unsafe.Add(ref destCrRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
}
+#endif
+ }
+
+ /// <summary>
+ /// Converts a 16x8 Rgb24 pixel matrix to two 8x8 Y matrices and four vectors of 2x2-averaged Cb/Cr data (4:2:0 subsampling)
+ /// </summary>
+ public static void Convert420(ReadOnlySpan rgbSpan, ref Block8x8F yBlockLeft, ref Block8x8F yBlockRight, ref Block8x8F cbBlock, ref Block8x8F crBlock, int row)
+ {
+ Debug.Assert(IsSupported, "AVX2 is required to run this converter");
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+ var f0299 = Vector256.Create(0.299f);
+ var f0587 = Vector256.Create(0.587f);
+ var f0114 = Vector256.Create(0.114f);
+ var fn0168736 = Vector256.Create(-0.168736f);
+ var fn0331264 = Vector256.Create(-0.331264f);
+ var f128 = Vector256.Create(128f);
+ var fn0418688 = Vector256.Create(-0.418688f);
+ var fn0081312F = Vector256.Create(-0.081312F);
+ var f05 = Vector256.Create(0.5f);
+ var zero = Vector256.Create(0).AsByte();
+
+ ref Vector256 rgbByteSpan = ref Unsafe.As>(ref MemoryMarshal.GetReference(rgbSpan));
+
+ int destOffset = row * 4; // each call writes 4 chroma vectors (outer loop below), so 'row' selects which group of 4 is filled
- extractToLanesMask = Unsafe.As>(ref MemoryMarshal.GetReference(MoveLast24BytesToSeparateLanes));
- rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref inRef, (IntPtr)160).AsUInt32(), extractToLanesMask).AsByte();
- rgb = Avx2.Shuffle(rgb, extractRgbMask);
+ ref Vector256 destCbRef = ref Unsafe.Add(ref Unsafe.As>(ref cbBlock), destOffset);
+ ref Vector256 destCrRef = ref Unsafe.Add(ref Unsafe.As>(ref crBlock), destOffset);
- rg = Avx2.UnpackLow(rgb, zero);
- bx = Avx2.UnpackHigh(rgb, zero);
+ var extractToLanesMask = Unsafe.As>(ref MemoryMarshal.GetReference(MoveFirst24BytesToSeparateLanes));
+ var extractRgbMask = Unsafe.As>(ref MemoryMarshal.GetReference(ExtractRgb));
+ Vector256 rgb, rg, bx;
+ Vector256 r, g, b;
+
+ Span> rDataLanes = stackalloc Vector256[4]; // per-channel scratch: slots 0 and 2 are written by the left loop, slots 1 and 3 by the right loop
+ Span> gDataLanes = stackalloc Vector256[4];
+ Span> bDataLanes = stackalloc Vector256[4];
+
+ const int bytesPerRgbStride = 24; // 8 pixels * 3 bytes
+ for (int i = 0; i < 4; i++)
+ {
+ // 16x2 => 8x1
+ // left 8x8 block: strides 0 and 2 of the current group of four strides
+ for (int j = 0; j < 4; j += 2)
+ {
+ rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref rgbByteSpan, (IntPtr)(bytesPerRgbStride * (i * 4 + j))).AsUInt32(), extractToLanesMask).AsByte();
+
+ rgb = Avx2.Shuffle(rgb, extractRgbMask);
+
+ rg = Avx2.UnpackLow(rgb, zero);
+ bx = Avx2.UnpackHigh(rgb, zero);
+
+ r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32());
+ g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32());
+ b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32());
+
+ int yBlockVerticalOffset = (i * 2) + ((j & 2) >> 1); // j is 0 or 2 here, giving vector index 2i or 2i + 1
+
+ // (0.299F * r) + (0.587F * g) + (0.114F * b);
+ Unsafe.Add(ref yBlockLeft.V0, yBlockVerticalOffset) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r);
+
+ rDataLanes[j] = r;
+ gDataLanes[j] = g;
+ bDataLanes[j] = b;
+ }
+
+ // 16x2 => 8x1
+ // right 8x8 block: strides 1 and 3 of the current group of four strides
+ for (int j = 1; j < 4; j += 2)
+ {
+ rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref rgbByteSpan, (IntPtr)(bytesPerRgbStride * (i * 4 + j))).AsUInt32(), extractToLanesMask).AsByte();
+
+ rgb = Avx2.Shuffle(rgb, extractRgbMask);
+
+ rg = Avx2.UnpackLow(rgb, zero);
+ bx = Avx2.UnpackHigh(rgb, zero);
- r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32());
- g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32());
- b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32());
+ r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32());
+ g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32());
+ b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32());
- // (0.299F * r) + (0.587F * g) + (0.114F * b);
- Unsafe.Add(ref destYRef, 7) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r);
+ int yBlockVerticalOffset = (i * 2) + ((j & 2) >> 1); // j is 1 or 3 here, giving vector index 2i or 2i + 1
- // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
- Unsafe.Add(ref destCbRef, 7) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r));
+ // (0.299F * r) + (0.587F * g) + (0.114F * b);
+ Unsafe.Add(ref yBlockRight.V0, yBlockVerticalOffset) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r);
- // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
- Unsafe.Add(ref destCrRef, 7) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
+ rDataLanes[j] = r;
+ gDataLanes[j] = g;
+ bDataLanes[j] = b;
+ }
+
+ r = Scale16x2_8x1(rDataLanes); // chroma is computed from the 2x2-averaged channels, not from the full resolution data
+ g = Scale16x2_8x1(gDataLanes);
+ b = Scale16x2_8x1(bDataLanes);
+
+ // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
+ Unsafe.Add(ref destCbRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r));
+
+ // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
+ Unsafe.Add(ref destCrRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
+ }
#endif
}
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+ /// <summary>
+ /// Scales 16x2 matrix to 8x1 using 2x2 average
+ /// </summary>
+ /// <param name="v">Input matrix consisting of 4 256bit vectors; elements 0 and 2 are summed as the left half, elements 1 and 3 as the right half</param>
+ /// <returns>256bit vector containing the averaged left half in its lower 4 values and the averaged right half in its upper 4 values</returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ internal static Vector256 Scale16x2_8x1(ReadOnlySpan> v)
+ {
+ Debug.Assert(Avx2.IsSupported, "AVX2 is required to run this converter");
+ DebugGuard.IsTrue(v.Length == 4, "Input span must consist of 4 elements");
+
+ var f025 = Vector256.Create(0.25f);
+
+ Vector256 left = Avx.Add(v[0], v[2]); // vertical sums of the left half
+ Vector256 right = Avx.Add(v[1], v[3]); // vertical sums of the right half
+ Vector256 avg2x2 = Avx.Multiply(Avx.HorizontalAdd(left, right), f025); // pairwise sums scaled by 1/4; HorizontalAdd interleaves left/right per 128-bit lane
+
+ return Avx2.Permute4x64(avg2x2.AsDouble(), 0b11_01_10_00).AsSingle(); // swaps the two middle 64-bit elements so all left results precede all right results
+ }
+#endif
}
}
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs
new file mode 100644
index 0000000000..a4abd532b3
--- /dev/null
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs
@@ -0,0 +1,121 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Runtime.InteropServices;
+using SixLabors.ImageSharp.Advanced;
+using SixLabors.ImageSharp.PixelFormats;
+
+namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
+{
+ /// <summary>
+ /// On-stack worker struct to efficiently encapsulate the TPixel -> Rgb24 -> YCbCr conversion chain of 16x8 pixel areas with 4:2:0 chroma subsampling.
+ /// </summary>
+ /// <typeparam name="TPixel">The pixel type to work on</typeparam>
+ internal ref struct YCbCrForwardConverter420
+ where TPixel : unmanaged, IPixel
+ {
+ /// <summary>
+ /// Number of pixels processed per single call
+ /// </summary>
+ private const int PixelsPerSample = 16 * 8;
+
+ /// <summary>
+ /// Total byte size of processed pixels converted from TPixel to Rgb24 (3 bytes per pixel)
+ /// </summary>
+ private const int RgbSpanByteSize = PixelsPerSample * 3;
+
+ /// <summary>
+ /// Size of sampling area from given frame pixel buffer
+ /// </summary>
+ private static readonly Size SampleSize = new Size(16, 8);
+
+ /// <summary>
+ /// The left Y component
+ /// </summary>
+ public Block8x8F YLeft;
+
+ /// <summary>
+ /// The right Y component
+ /// </summary>
+ public Block8x8F YRight;
+
+ /// <summary>
+ /// The Cb component
+ /// </summary>
+ public Block8x8F Cb;
+
+ /// <summary>
+ /// The Cr component
+ /// </summary>
+ public Block8x8F Cr;
+
+ /// <summary>
+ /// The color conversion tables, only created when the vectorized converter is not supported
+ /// </summary>
+ private RgbToYCbCrConverterLut colorTables;
+
+ /// <summary>
+ /// Temporary 16x8 block to hold TPixel data
+ /// </summary>
+ private Span pixelSpan;
+
+ /// <summary>
+ /// Temporary RGB block holding the converted pixels, followed by any AVX compatibility padding
+ /// </summary>
+ private Span rgbSpan;
+
+ /// <summary>
+ /// Sampled pixel buffer size
+ /// </summary>
+ private Size samplingAreaSize;
+
+ /// <summary>
+ /// Configuration for internal operations
+ /// </summary>
+ private Configuration config;
+
+ public YCbCrForwardConverter420(ImageFrame frame)
+ {
+ // matrices would be filled during convert calls
+ this.YLeft = default;
+ this.YRight = default;
+ this.Cb = default;
+ this.Cr = default;
+
+ // temporary pixel buffers
+ this.pixelSpan = new TPixel[PixelsPerSample].AsSpan();
+ this.rgbSpan = MemoryMarshal.Cast(new byte[RgbSpanByteSize + RgbToYCbCrConverterVectorized.AvxCompatibilityPadding].AsSpan()); // padding is 0 when the vectorized converter is unavailable
+
+ // frame data
+ this.samplingAreaSize = new Size(frame.Width, frame.Height);
+ this.config = frame.GetConfiguration();
+
+ // lookup tables are only needed by the non-vectorized fallback
+ if (!RgbToYCbCrConverterVectorized.IsSupported)
+ {
+ this.colorTables = RgbToYCbCrConverterLut.Create();
+ }
+ else
+ {
+ this.colorTables = default;
+ }
+ }
+
+ public void Convert(int x, int y, ref RowOctet currentRows, int idx)
+ {
+ YCbCrForwardConverter.LoadAndStretchEdges(currentRows, this.pixelSpan, new Point(x, y), SampleSize, this.samplingAreaSize);
+
+ PixelOperations.Instance.ToRgb24(this.config, this.pixelSpan, this.rgbSpan);
+
+ if (RgbToYCbCrConverterVectorized.IsSupported)
+ {
+ RgbToYCbCrConverterVectorized.Convert420(this.rgbSpan, ref this.YLeft, ref this.YRight, ref this.Cb, ref this.Cr, idx); // idx is forwarded as the 'row' argument
+ }
+ else
+ {
+ this.colorTables.Convert420(this.rgbSpan, ref this.YLeft, ref this.YRight, ref this.Cb, ref this.Cr, idx);
+ }
+ }
+ }
+}
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
new file mode 100644
index 0000000000..ef589272bd
--- /dev/null
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
@@ -0,0 +1,122 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Runtime.InteropServices;
+using SixLabors.ImageSharp.Advanced;
+using SixLabors.ImageSharp.PixelFormats;
+
+namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
+{
+ /// <summary>
+ /// On-stack worker struct to efficiently encapsulate the TPixel -> Rgb24 -> YCbCr conversion chain of 8x8 pixel blocks (4:4:4, one chroma value per pixel).
+ /// </summary>
+ /// <typeparam name="TPixel">The pixel type to work on</typeparam>
+ internal ref struct YCbCrForwardConverter444
+ where TPixel : unmanaged, IPixel
+ {
+ /// <summary>
+ /// Number of pixels processed per single call
+ /// </summary>
+ private const int PixelsPerSample = 8 * 8;
+
+ /// <summary>
+ /// Total byte size of processed pixels converted from TPixel to Rgb24 (3 bytes per pixel)
+ /// </summary>
+ private const int RgbSpanByteSize = PixelsPerSample * 3;
+
+ /// <summary>
+ /// Size of sampling area from given frame pixel buffer
+ /// </summary>
+ private static readonly Size SampleSize = new Size(8, 8);
+
+ /// <summary>
+ /// The Y component
+ /// </summary>
+ public Block8x8F Y;
+
+ /// <summary>
+ /// The Cb component
+ /// </summary>
+ public Block8x8F Cb;
+
+ /// <summary>
+ /// The Cr component
+ /// </summary>
+ public Block8x8F Cr;
+
+ /// <summary>
+ /// The color conversion tables, only created when the vectorized converter is not supported
+ /// </summary>
+ private RgbToYCbCrConverterLut colorTables;
+
+ /// <summary>
+ /// Temporary 64-pixel span to hold unconverted TPixel data
+ /// </summary>
+ private Span pixelSpan;
+
+ /// <summary>
+ /// Temporary span to hold the 64 converted Rgb24 pixels, followed by any AVX compatibility padding
+ /// </summary>
+ private Span rgbSpan;
+
+ /// <summary>
+ /// Sampled pixel buffer size
+ /// </summary>
+ private Size samplingAreaSize;
+
+ /// <summary>
+ /// Configuration for internal operations
+ /// </summary>
+ private Configuration config;
+
+ public YCbCrForwardConverter444(ImageFrame frame)
+ {
+ // matrices would be filled during convert calls
+ this.Y = default;
+ this.Cb = default;
+ this.Cr = default;
+
+ // temporary pixel buffers
+ this.pixelSpan = new TPixel[PixelsPerSample].AsSpan();
+ this.rgbSpan = MemoryMarshal.Cast(new byte[RgbSpanByteSize + RgbToYCbCrConverterVectorized.AvxCompatibilityPadding].AsSpan()); // padding is 0 when the vectorized converter is unavailable
+
+ // frame data
+ this.samplingAreaSize = new Size(frame.Width, frame.Height);
+ this.config = frame.GetConfiguration();
+
+ // lookup tables are only needed by the non-vectorized fallback
+ if (!RgbToYCbCrConverterVectorized.IsSupported)
+ {
+ this.colorTables = RgbToYCbCrConverterLut.Create();
+ }
+ else
+ {
+ this.colorTables = default;
+ }
+ }
+
+ /// <summary>
+ /// Converts a 8x8 image area at position (x,y), read through 'currentRows', placing the result in the members of the structure (Y, Cb, Cr)
+ /// </summary>
+ public void Convert(int x, int y, ref RowOctet currentRows)
+ {
+ YCbCrForwardConverter.LoadAndStretchEdges(currentRows, this.pixelSpan, new Point(x, y), SampleSize, this.samplingAreaSize);
+
+ PixelOperations.Instance.ToRgb24(this.config, this.pixelSpan, this.rgbSpan);
+
+ ref Block8x8F yBlock = ref this.Y;
+ ref Block8x8F cbBlock = ref this.Cb;
+ ref Block8x8F crBlock = ref this.Cr;
+
+ if (RgbToYCbCrConverterVectorized.IsSupported)
+ {
+ RgbToYCbCrConverterVectorized.Convert444(this.rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
+ }
+ else
+ {
+ this.colorTables.Convert444(this.rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
+ }
+ }
+ }
+}
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
index 81e64b277b..f5ef770914 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
@@ -2,81 +2,59 @@
// Licensed under the Apache License, Version 2.0.
using System;
-using SixLabors.ImageSharp.Advanced;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
using SixLabors.ImageSharp.PixelFormats;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
{
- ///
- /// On-stack worker struct to efficiently encapsulate the TPixel -> Rgb24 -> YCbCr conversion chain of 8x8 pixel blocks.
- ///
- /// The pixel type to work on
- internal ref struct YCbCrForwardConverter
+ /// <summary>
+ /// Shared helpers for the TPixel -> Rgb24 -> YCbCr forward converters of the jpeg encoder.
+ /// </summary>
+ internal static class YCbCrForwardConverter
where TPixel : unmanaged, IPixel
{
- ///
- /// The Y component
- ///
- public Block8x8F Y;
-
- ///
- /// The Cb component
- ///
- public Block8x8F Cb;
-
- ///
- /// The Cr component
- ///
- public Block8x8F Cr;
+ /// <summary>
+ /// Copies a sample area from the source rows into <paramref name="dest"/>. When the area reaches past the
+ /// right or bottom edge of the image, the last available column / row is repeated to fill the remainder.
+ /// </summary>
+ /// <param name="source">Rows to read from; index 0 is the first row of the sample</param>
+ /// <param name="dest">Destination holding sampleSize.Width * sampleSize.Height pixels, row after row</param>
+ /// <param name="start">Top-left corner of the sample; X indexes into every source row, Y only limits the number of rows read</param>
+ /// <param name="sampleSize">Size of the area written to <paramref name="dest"/></param>
+ /// <param name="totalSize">Size of the whole image the sample is taken from</param>
+ public static void LoadAndStretchEdges(RowOctet source, Span dest, Point start, Size sampleSize, Size totalSize)
+ {
+ // the first sample of an image starts at (0, 0), so zero must be an accepted coordinate
+ DebugGuard.MustBeBetweenOrEqualTo(start.X, 0, totalSize.Width - 1, nameof(start.X));
+ DebugGuard.MustBeBetweenOrEqualTo(start.Y, 0, totalSize.Height - 1, nameof(start.Y));
- ///
- /// The color conversion tables
- ///
- private RgbToYCbCrConverterLut colorTables;
+ // part of the sample that actually lies inside the image
+ int width = Math.Min(sampleSize.Width, totalSize.Width - start.X);
+ int height = Math.Min(sampleSize.Height, totalSize.Height - start.Y);
- ///
- /// Temporal 8x8 block to hold TPixel data
- ///
- private GenericBlock8x8 pixelBlock;
+ uint byteWidth = (uint)(width * Unsafe.SizeOf());
+ int remainderXCount = sampleSize.Width - width;
- ///
- /// Temporal RGB block
- ///
- private GenericBlock8x8 rgbBlock;
+ ref byte blockStart = ref MemoryMarshal.GetReference(MemoryMarshal.Cast(dest));
+ int rowSizeInBytes = sampleSize.Width * Unsafe.SizeOf();
- public static YCbCrForwardConverter Create()
- {
- var result = default(YCbCrForwardConverter);
- if (!RgbToYCbCrConverterVectorized.IsSupported)
+ for (int y = 0; y < height; y++)
{
- // Avoid creating lookup tables, when vectorized converter is supported
- result.colorTables = RgbToYCbCrConverterLut.Create();
- }
+ Span row = source[y];
- return result;
- }
+ ref byte s = ref Unsafe.As(ref row[start.X]);
+ ref byte d = ref Unsafe.Add(ref blockStart, y * rowSizeInBytes);
- ///
- /// Converts a 8x8 image area inside 'pixels' at position (x,y) placing the result members of the structure (, , )
- ///
- public void Convert(ImageFrame frame, int x, int y, ref RowOctet currentRows)
- {
- this.pixelBlock.LoadAndStretchEdges(frame.PixelBuffer, x, y, ref currentRows);
+ Unsafe.CopyBlock(ref d, ref s, byteWidth);
+
+ // repeat the last copied pixel across the columns that fall outside the image
+ ref TPixel last = ref Unsafe.Add(ref Unsafe.As(ref d), width - 1);
- Span rgbSpan = this.rgbBlock.AsSpanUnsafe();
- PixelOperations.Instance.ToRgb24(frame.GetConfiguration(), this.pixelBlock.AsSpanUnsafe(), rgbSpan);
+ for (int x = 1; x <= remainderXCount; x++)
+ {
+ Unsafe.Add(ref last, x) = last;
+ }
+ }
- ref Block8x8F yBlock = ref this.Y;
- ref Block8x8F cbBlock = ref this.Cb;
- ref Block8x8F crBlock = ref this.Cr;
+ int remainderYCount = sampleSize.Height - height;
- if (RgbToYCbCrConverterVectorized.IsSupported)
+ if (remainderYCount == 0)
{
- RgbToYCbCrConverterVectorized.Convert(rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
+ return;
}
- else
+
+ // repeat the last filled destination row across the rows that fall outside the image
+ ref byte lastRowStart = ref Unsafe.Add(ref blockStart, (height - 1) * rowSizeInBytes);
+
+ for (int y = 1; y <= remainderYCount; y++)
{
- this.colorTables.Convert(rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
+ ref byte remStart = ref Unsafe.Add(ref lastRowStart, rowSizeInBytes * y);
+ Unsafe.CopyBlock(ref remStart, ref lastRowStart, (uint)rowSizeInBytes);
}
}
}
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
index a6d0622dd8..f31d07efca 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
@@ -1,8 +1,13 @@
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.
+using System.Diagnostics;
using System.Numerics;
using System.Runtime.CompilerServices;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
// ReSharper disable InconsistentNaming
namespace SixLabors.ImageSharp.Formats.Jpeg.Components
@@ -10,7 +15,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
///
/// Contains inaccurate, but fast forward and inverse DCT implementations.
///
- internal static class FastFloatingPointDCT
+ internal static partial class FastFloatingPointDCT
{
#pragma warning disable SA1310 // FieldNamesMustNotContainUnderscore
private const float C_1_175876 = 1.175875602f;
@@ -38,147 +43,31 @@ internal static class FastFloatingPointDCT
private const float C_0_765367 = 0.765366865f;
private const float C_0_125 = 0.1250f;
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+ private static readonly Vector256 C_V_0_5411 = Vector256.Create(0.541196f);
+ private static readonly Vector256 C_V_1_3065 = Vector256.Create(1.306563f);
+ private static readonly Vector256 C_V_1_1758 = Vector256.Create(1.175876f);
+ private static readonly Vector256 C_V_0_7856 = Vector256.Create(0.785695f);
+ private static readonly Vector256 C_V_1_3870 = Vector256.Create(1.387040f);
+ private static readonly Vector256 C_V_0_2758 = Vector256.Create(0.275899f);
+
+ private static readonly Vector256 C_V_n1_9615 = Vector256.Create(-1.961570560f);
+ private static readonly Vector256 C_V_n0_3901 = Vector256.Create(-0.390180644f);
+ private static readonly Vector256 C_V_n0_8999 = Vector256.Create(-0.899976223f);
+ private static readonly Vector256 C_V_n2_5629 = Vector256.Create(-2.562915447f);
+ private static readonly Vector256 C_V_0_2986 = Vector256.Create(0.298631336f);
+ private static readonly Vector256 C_V_2_0531 = Vector256.Create(2.053119869f);
+ private static readonly Vector256 C_V_3_0727 = Vector256.Create(3.072711026f);
+ private static readonly Vector256 C_V_1_5013 = Vector256.Create(1.501321110f);
+ private static readonly Vector256 C_V_n1_8477 = Vector256.Create(-1.847759065f);
+ private static readonly Vector256