diff --git a/shared-infrastructure b/shared-infrastructure
index 48e73f455f..1f7ee70281 160000
--- a/shared-infrastructure
+++ b/shared-infrastructure
@@ -1 +1 @@
-Subproject commit 48e73f455f15eafefbe3175efc7433e5f277e506
+Subproject commit 1f7ee702812f3a1713ab7f749c0faae0ef139ed7
diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs
index 0581993014..ef457f7ceb 100644
--- a/src/ImageSharp/Common/Helpers/Numerics.cs
+++ b/src/ImageSharp/Common/Helpers/Numerics.cs
@@ -23,6 +23,28 @@ internal static class Numerics
         private const int ShuffleAlphaControl = 0b_11_11_11_11;
 #endif
 
+#if !SUPPORTS_BITOPERATIONS
+        /// <summary>
+        /// Gets a lookup table with the number of bits needed to hold each byte value.
+        /// </summary>
+        private static ReadOnlySpan<byte> BitCountLut => new byte[]
+        {
+            0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5,
+            5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+            6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+            7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+            8, 8, 8,
+        };
+#endif
+
         /// <summary>
         /// Determine the Greatest CommonDivisor (GCD) of two numbers.
         /// </summary>
@@ -756,7 +778,7 @@ public static float Lerp(float value1, float value2, float amount)
         /// widening them to 32-bit integers and performing four additions.
         /// </summary>
         /// <remarks>
-        /// <code>byte(1, 2, 3, 4,  5, 6, 7, 8,  9, 10, 11, 12,  13, 14, 15, 16)</code>
+        /// <c>byte(1, 2, 3, 4,  5, 6, 7, 8,  9, 10, 11, 12,  13, 14, 15, 16)</c>
         /// is widened and added onto <paramref name="accumulator"/> as such:
         /// <code>
         ///  accumulator += i32(1, 2, 3, 4);
@@ -825,5 +847,26 @@ public static int EvenReduceSum(Vector256<int> accumulator)
             return Sse2.ConvertToInt32(vsum);
         }
 #endif
+
+        /// <summary>
+        /// Calculates the minimum number of bits needed to store the given value.
+        /// </summary>
+        /// <param name="number">Unsigned integer to store</param>
+        /// <returns>Minimum number of bits needed to store given value</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static int MinimumBitsToStore16(uint number)
+        {
+#if !SUPPORTS_BITOPERATIONS
+            if (number < 0x100)
+            {
+                return BitCountLut[(int)number];
+            }
+
+            return 8 + BitCountLut[(int)number >> 8];
+#else
+            const int bitInUnsignedInteger = sizeof(uint) * 8;
+            return bitInUnsignedInteger - BitOperations.LeadingZeroCount(number);
+#endif
+        }
     }
 }
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
index 4faf577fd9..b530a37e77 100644
--- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
@@ -532,6 +532,7 @@ private static void Shuffle4Slice3(
             /// <summary>
             /// Performs a multiplication and an addition of the <see cref="Vector256{T}"/>.
             /// </summary>
+            /// <remarks>ret = (vm0 * vm1) + va</remarks>
             /// <param name="va">The vector to add to the intermediate result.</param>
             /// <param name="vm0">The first vector to multiply.</param>
             /// <param name="vm1">The second vector to multiply.</param>
@@ -552,6 +553,30 @@ public static Vector256<float> MultiplyAdd(
                 }
             }
 
+            /// <summary>
+            /// Performs a multiplication and a subtraction of the <see cref="Vector256{T}"/>.
+            /// </summary>
+            /// <remarks>ret = (vm0 * vm1) - vs</remarks>
+            /// <param name="vs">The vector to subtract from the intermediate result.</param>
+            /// <param name="vm0">The first vector to multiply.</param>
+            /// <param name="vm1">The second vector to multiply.</param>
+            /// <returns>The <see cref="Vector256{T}"/>.</returns>
+            [MethodImpl(InliningOptions.ShortMethod)]
+            public static Vector256<float> MultiplySubstract(
+                in Vector256<float> vs,
+                in Vector256<float> vm0,
+                in Vector256<float> vm1)
+            {
+                if (Fma.IsSupported)
+                {
+                    return Fma.MultiplySubtract(vm1, vm0, vs);
+                }
+                else
+                {
+                    return Avx.Subtract(Avx.Multiply(vm0, vm1), vs);
+                }
+            }
+
             /// <summary>
             /// <see cref="ByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).
             /// </summary>
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
index 2d19f5ce26..8ca7b0c801 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
@@ -18,7 +18,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
     /// <summary>
     /// Represents a Jpeg block with <see cref="float"/> coefficients.
     /// </summary>
-    [StructLayout(LayoutKind.Sequential)]
+    [StructLayout(LayoutKind.Explicit)]
     internal partial struct Block8x8F : IEquatable<Block8x8F>
     {
         /// <summary>
@@ -27,29 +27,69 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
         public const int Size = 64;
 
 #pragma warning disable SA1600 // ElementsMustBeDocumented
+        [FieldOffset(0)]
         public Vector4 V0L;
+        [FieldOffset(16)]
         public Vector4 V0R;
 
+        [FieldOffset(32)]
         public Vector4 V1L;
+        [FieldOffset(48)]
         public Vector4 V1R;
 
+        [FieldOffset(64)]
         public Vector4 V2L;
+        [FieldOffset(80)]
         public Vector4 V2R;
 
+        [FieldOffset(96)]
         public Vector4 V3L;
+        [FieldOffset(112)]
         public Vector4 V3R;
 
+        [FieldOffset(128)]
         public Vector4 V4L;
+        [FieldOffset(144)]
         public Vector4 V4R;
 
+        [FieldOffset(160)]
         public Vector4 V5L;
+        [FieldOffset(176)]
         public Vector4 V5R;
 
+        [FieldOffset(192)]
         public Vector4 V6L;
+        [FieldOffset(208)]
         public Vector4 V6R;
 
+        [FieldOffset(224)]
         public Vector4 V7L;
+        [FieldOffset(240)]
         public Vector4 V7R;
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+        /// <summary>
+        /// The number of rows, each holding 8 scalar coefficients, in a <see cref="Block8x8F"/>.
+        /// </summary>
+        public const int RowCount = 8;
+
+        [FieldOffset(0)]
+        public Vector256<float> V0;
+        [FieldOffset(32)]
+        public Vector256<float> V1;
+        [FieldOffset(64)]
+        public Vector256<float> V2;
+        [FieldOffset(96)]
+        public Vector256<float> V3;
+        [FieldOffset(128)]
+        public Vector256<float> V4;
+        [FieldOffset(160)]
+        public Vector256<float> V5;
+        [FieldOffset(192)]
+        public Vector256<float> V6;
+        [FieldOffset(224)]
+        public Vector256<float> V7;
+#endif
 #pragma warning restore SA1600 // ElementsMustBeDocumented
 
         /// <summary>
@@ -278,14 +318,14 @@ public void MultiplyInPlace(float value)
             if (Avx.IsSupported)
             {
                 var valueVec = Vector256.Create(value);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V0L) = Avx.Multiply(Unsafe.As<Vector4, Vector256<float>>(ref this.V0L), valueVec);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V1L) = Avx.Multiply(Unsafe.As<Vector4, Vector256<float>>(ref this.V1L), valueVec);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V2L) = Avx.Multiply(Unsafe.As<Vector4, Vector256<float>>(ref this.V2L), valueVec);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V3L) = Avx.Multiply(Unsafe.As<Vector4, Vector256<float>>(ref this.V3L), valueVec);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V4L) = Avx.Multiply(Unsafe.As<Vector4, Vector256<float>>(ref this.V4L), valueVec);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V5L) = Avx.Multiply(Unsafe.As<Vector4, Vector256<float>>(ref this.V5L), valueVec);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V6L) = Avx.Multiply(Unsafe.As<Vector4, Vector256<float>>(ref this.V6L), valueVec);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V7L) = Avx.Multiply(Unsafe.As<Vector4, Vector256<float>>(ref this.V7L), valueVec);
+                this.V0 = Avx.Multiply(this.V0, valueVec);
+                this.V1 = Avx.Multiply(this.V1, valueVec);
+                this.V2 = Avx.Multiply(this.V2, valueVec);
+                this.V3 = Avx.Multiply(this.V3, valueVec);
+                this.V4 = Avx.Multiply(this.V4, valueVec);
+                this.V5 = Avx.Multiply(this.V5, valueVec);
+                this.V6 = Avx.Multiply(this.V6, valueVec);
+                this.V7 = Avx.Multiply(this.V7, valueVec);
             }
             else
 #endif
@@ -319,45 +359,14 @@ public unsafe void MultiplyInPlace(ref Block8x8F other)
 #if SUPPORTS_RUNTIME_INTRINSICS
             if (Avx.IsSupported)
             {
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V0L)
-                    = Avx.Multiply(
-                        Unsafe.As<Vector4, Vector256<float>>(ref this.V0L),
-                        Unsafe.As<Vector4, Vector256<float>>(ref other.V0L));
-
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V1L)
-                    = Avx.Multiply(
-                        Unsafe.As<Vector4, Vector256<float>>(ref this.V1L),
-                        Unsafe.As<Vector4, Vector256<float>>(ref other.V1L));
-
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V2L)
-                    = Avx.Multiply(
-                        Unsafe.As<Vector4, Vector256<float>>(ref this.V2L),
-                        Unsafe.As<Vector4, Vector256<float>>(ref other.V2L));
-
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V3L)
-                    = Avx.Multiply(
-                        Unsafe.As<Vector4, Vector256<float>>(ref this.V3L),
-                        Unsafe.As<Vector4, Vector256<float>>(ref other.V3L));
-
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V4L)
-                    = Avx.Multiply(
-                        Unsafe.As<Vector4, Vector256<float>>(ref this.V4L),
-                        Unsafe.As<Vector4, Vector256<float>>(ref other.V4L));
-
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V5L)
-                    = Avx.Multiply(
-                        Unsafe.As<Vector4, Vector256<float>>(ref this.V5L),
-                        Unsafe.As<Vector4, Vector256<float>>(ref other.V5L));
-
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V6L)
-                    = Avx.Multiply(
-                        Unsafe.As<Vector4, Vector256<float>>(ref this.V6L),
-                        Unsafe.As<Vector4, Vector256<float>>(ref other.V6L));
-
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V7L)
-                    = Avx.Multiply(
-                        Unsafe.As<Vector4, Vector256<float>>(ref this.V7L),
-                        Unsafe.As<Vector4, Vector256<float>>(ref other.V7L));
+                this.V0 = Avx.Multiply(this.V0, other.V0);
+                this.V1 = Avx.Multiply(this.V1, other.V1);
+                this.V2 = Avx.Multiply(this.V2, other.V2);
+                this.V3 = Avx.Multiply(this.V3, other.V3);
+                this.V4 = Avx.Multiply(this.V4, other.V4);
+                this.V5 = Avx.Multiply(this.V5, other.V5);
+                this.V6 = Avx.Multiply(this.V6, other.V6);
+                this.V7 = Avx.Multiply(this.V7, other.V7);
             }
             else
 #endif
@@ -392,14 +401,14 @@ public void AddInPlace(float value)
             if (Avx.IsSupported)
             {
                 var valueVec = Vector256.Create(value);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V0L) = Avx.Add(Unsafe.As<Vector4, Vector256<float>>(ref this.V0L), valueVec);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V1L) = Avx.Add(Unsafe.As<Vector4, Vector256<float>>(ref this.V1L), valueVec);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V2L) = Avx.Add(Unsafe.As<Vector4, Vector256<float>>(ref this.V2L), valueVec);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V3L) = Avx.Add(Unsafe.As<Vector4, Vector256<float>>(ref this.V3L), valueVec);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V4L) = Avx.Add(Unsafe.As<Vector4, Vector256<float>>(ref this.V4L), valueVec);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V5L) = Avx.Add(Unsafe.As<Vector4, Vector256<float>>(ref this.V5L), valueVec);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V6L) = Avx.Add(Unsafe.As<Vector4, Vector256<float>>(ref this.V6L), valueVec);
-                Unsafe.As<Vector4, Vector256<float>>(ref this.V7L) = Avx.Add(Unsafe.As<Vector4, Vector256<float>>(ref this.V7L), valueVec);
+                this.V0 = Avx.Add(this.V0, valueVec);
+                this.V1 = Avx.Add(this.V1, valueVec);
+                this.V2 = Avx.Add(this.V2, valueVec);
+                this.V3 = Avx.Add(this.V3, valueVec);
+                this.V4 = Avx.Add(this.V4, valueVec);
+                this.V5 = Avx.Add(this.V5, valueVec);
+                this.V6 = Avx.Add(this.V6, valueVec);
+                this.V7 = Avx.Add(this.V7, valueVec);
             }
             else
 #endif
@@ -468,81 +477,6 @@ public static unsafe void Quantize(
             DivideRoundAll(ref dest, ref qt);
         }
 
-        /// <summary>
-        /// Scales the 16x16 region represented by the 4 source blocks to the 8x8 DST block.
-        /// </summary>
-        /// <param name="destination">The destination block.</param>
-        /// <param name="source">The source block.</param>
-        public static unsafe void Scale16X16To8X8(ref Block8x8F destination, ReadOnlySpan<Block8x8F> source)
-        {
-#if SUPPORTS_RUNTIME_INTRINSICS
-            if (Avx2.IsSupported)
-            {
-                Scale16X16To8X8Vectorized(ref destination, source);
-                return;
-            }
-#endif
-
-            Scale16X16To8X8Scalar(ref destination, source);
-        }
-
-        private static void Scale16X16To8X8Vectorized(ref Block8x8F destination, ReadOnlySpan<Block8x8F> source)
-        {
-#if SUPPORTS_RUNTIME_INTRINSICS
-            Debug.Assert(Avx2.IsSupported, "AVX2 is required to execute this method");
-
-            var f2 = Vector256.Create(2f);
-            var f025 = Vector256.Create(0.25f);
-            Vector256<int> switchInnerDoubleWords = Unsafe.As<byte, Vector256<int>>(ref MemoryMarshal.GetReference(SimdUtils.HwIntrinsics.PermuteMaskSwitchInnerDWords8x32));
-            ref Vector256<float> destRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref destination);
-
-            for (int i = 0; i < 2; i++)
-            {
-                ref Vector256<float> in1 = ref Unsafe.As<Block8x8F, Vector256<float>>(ref Unsafe.Add(ref MemoryMarshal.GetReference(source), 2 * i));
-                ref Vector256<float> in2 = ref Unsafe.As<Block8x8F, Vector256<float>>(ref Unsafe.Add(ref MemoryMarshal.GetReference(source), (2 * i) + 1));
-
-                for (int j = 0; j < 8; j += 2)
-                {
-                    Vector256<float> a = Unsafe.Add(ref in1, j);
-                    Vector256<float> b = Unsafe.Add(ref in1, j + 1);
-                    Vector256<float> c = Unsafe.Add(ref in2, j);
-                    Vector256<float> d = Unsafe.Add(ref in2, j + 1);
-
-                    Vector256<float> calc1 = Avx.Shuffle(a, c, 0b10_00_10_00);
-                    Vector256<float> calc2 = Avx.Shuffle(a, c, 0b11_01_11_01);
-                    Vector256<float> calc3 = Avx.Shuffle(b, d, 0b10_00_10_00);
-                    Vector256<float> calc4 = Avx.Shuffle(b, d, 0b11_01_11_01);
-
-                    Vector256<float> sum = Avx.Add(Avx.Add(calc1, calc2), Avx.Add(calc3, calc4));
-                    Vector256<float> add = Avx.Add(sum, f2);
-                    Vector256<float> res = Avx.Multiply(add, f025);
-
-                    destRef = Avx2.PermuteVar8x32(res, switchInnerDoubleWords);
-                    destRef = ref Unsafe.Add(ref destRef, 1);
-                }
-            }
-#endif
-        }
-
-        private static unsafe void Scale16X16To8X8Scalar(ref Block8x8F destination, ReadOnlySpan<Block8x8F> source)
-        {
-            for (int i = 0; i < 4; i++)
-            {
-                int dstOff = ((i & 2) << 4) | ((i & 1) << 2);
-                Block8x8F iSource = source[i];
-
-                for (int y = 0; y < 4; y++)
-                {
-                    for (int x = 0; x < 4; x++)
-                    {
-                        int j = (16 * y) + (2 * x);
-                        float sum = iSource[j] + iSource[j + 1] + iSource[j + 8] + iSource[j + 9];
-                        destination[(8 * y) + x + dstOff] = (sum + 2) * .25F;
-                    }
-                }
-            }
-        }
-
         [MethodImpl(InliningOptions.ShortMethod)]
         private static void DivideRoundAll(ref Block8x8F a, ref Block8x8F b)
         {
@@ -553,19 +487,13 @@ private static void DivideRoundAll(ref Block8x8F a, ref Block8x8F b)
                 var vadd = Vector256.Create(.5F);
                 var vone = Vector256.Create(1f);
 
-                ref Vector256<float> aBase = ref Unsafe.AsRef(Unsafe.As<Vector4, Vector256<float>>(ref a.V0L));
-                ref Vector256<float> bBase = ref Unsafe.AsRef(Unsafe.As<Vector4, Vector256<float>>(ref b.V0L));
-                ref Vector256<float> aEnd = ref Unsafe.Add(ref aBase, 8);
-
-                do
+                for (int i = 0; i < RowCount; i++)
                 {
-                    Vector256<float> voff = Avx.Multiply(Avx.Min(Avx.Max(vnegOne, aBase), vone), vadd);
-                    Unsafe.Add(ref aBase, 0) = Avx.Add(Avx.Divide(aBase, bBase), voff);
-
-                    aBase = ref Unsafe.Add(ref aBase, 1);
-                    bBase = ref Unsafe.Add(ref bBase, 1);
+                    ref Vector256<float> aRow = ref Unsafe.Add(ref a.V0, i);
+                    ref Vector256<float> bRow = ref Unsafe.Add(ref b.V0, i);
+                    Vector256<float> voff = Avx.Multiply(Avx.Min(Avx.Max(vnegOne, aRow), vone), vadd);
+                    aRow = Avx.Add(Avx.Divide(aRow, bRow), voff);
                 }
-                while (Unsafe.IsAddressLessThan(ref aBase, ref aEnd));
             }
             else
 #endif
@@ -805,26 +733,26 @@ public void TransposeInto(ref Block8x8F d)
                 Vector256<float> t0 = Avx.UnpackLow(r0, r1);
                 Vector256<float> t2 = Avx.UnpackLow(r2, r3);
                 Vector256<float> v = Avx.Shuffle(t0, t2, 0x4E);
-                Unsafe.As<Vector4, Vector256<float>>(ref d.V0L) = Avx.Blend(t0, v, 0xCC);
-                Unsafe.As<Vector4, Vector256<float>>(ref d.V1L) = Avx.Blend(t2, v, 0x33);
+                d.V0 = Avx.Blend(t0, v, 0xCC);
+                d.V1 = Avx.Blend(t2, v, 0x33);
 
                 Vector256<float> t4 = Avx.UnpackLow(r4, r5);
                 Vector256<float> t6 = Avx.UnpackLow(r6, r7);
                 v = Avx.Shuffle(t4, t6, 0x4E);
-                Unsafe.As<Vector4, Vector256<float>>(ref d.V4L) = Avx.Blend(t4, v, 0xCC);
-                Unsafe.As<Vector4, Vector256<float>>(ref d.V5L) = Avx.Blend(t6, v, 0x33);
+                d.V4 = Avx.Blend(t4, v, 0xCC);
+                d.V5 = Avx.Blend(t6, v, 0x33);
 
                 Vector256<float> t1 = Avx.UnpackHigh(r0, r1);
                 Vector256<float> t3 = Avx.UnpackHigh(r2, r3);
                 v = Avx.Shuffle(t1, t3, 0x4E);
-                Unsafe.As<Vector4, Vector256<float>>(ref d.V2L) = Avx.Blend(t1, v, 0xCC);
-                Unsafe.As<Vector4, Vector256<float>>(ref d.V3L) = Avx.Blend(t3, v, 0x33);
+                d.V2 = Avx.Blend(t1, v, 0xCC);
+                d.V3 = Avx.Blend(t3, v, 0x33);
 
                 Vector256<float> t5 = Avx.UnpackHigh(r4, r5);
                 Vector256<float> t7 = Avx.UnpackHigh(r6, r7);
                 v = Avx.Shuffle(t5, t7, 0x4E);
-                Unsafe.As<Vector4, Vector256<float>>(ref d.V6L) = Avx.Blend(t5, v, 0xCC);
-                Unsafe.As<Vector4, Vector256<float>>(ref d.V7L) = Avx.Blend(t7, v, 0x33);
+                d.V6 = Avx.Blend(t5, v, 0xCC);
+                d.V7 = Avx.Blend(t7, v, 0x33);
             }
             else
 #endif
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs
index bc2c7634b5..bc6c8c6cc7 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs
@@ -1,4 +1,4 @@
-﻿// Copyright (c) Six Labors.
+// Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.
 
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
@@ -44,7 +44,7 @@ public HuffmanLut(HuffmanSpec spec)
                 }
             }
 
-            this.Values = new uint[maxValue + 1];
+            this.Values = new int[maxValue + 1];
 
             int code = 0;
             int k = 0;
@@ -54,7 +54,7 @@ public HuffmanLut(HuffmanSpec spec)
                 int bits = (i + 1) << 24;
                 for (int j = 0; j < spec.Count[i]; j++)
                 {
-                    this.Values[spec.Values[k]] = (uint)(bits | code);
+                    this.Values[spec.Values[k]] = bits | code;
                     code++;
                     k++;
                 }
@@ -66,6 +66,6 @@ public HuffmanLut(HuffmanSpec spec)
         /// <summary>
         /// Gets the collection of huffman values.
         /// </summary>
-        public uint[] Values { get; }
+        public int[] Values { get; }
     }
-}
\ No newline at end of file
+}
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
new file mode 100644
index 0000000000..ca352397b8
--- /dev/null
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs
@@ -0,0 +1,392 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System.IO;
+using System.Runtime.CompilerServices;
+using System.Threading;
+using SixLabors.ImageSharp.Memory;
+using SixLabors.ImageSharp.PixelFormats;
+
+namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
+{
+    internal class HuffmanScanEncoder
+    {
+        /// <summary>
+        /// Number of bytes cached before being written to target stream via Stream.Write(byte[], offset, count).
+        /// </summary>
+        /// <remarks>
+        /// This is subject to change, 1024 seems to be the best value in terms of performance.
+        /// <see cref="Emit(int, int)"/> expects it to be at least 8 (see comments in method body).
+        /// </remarks>
+        private const int EmitBufferSizeInBytes = 1024;
+
+        /// <summary>
+        /// A buffer for reducing the number of stream writes when emitting Huffman tables.
+        /// </summary>
+        private readonly byte[] emitBuffer = new byte[EmitBufferSizeInBytes];
+
+        /// <summary>
+        /// Number of filled bytes in <see cref="emitBuffer"/> buffer
+        /// </summary>
+        private int emitLen = 0;
+
+        /// <summary>
+        /// Emitted bits 'micro buffer' before being transferred to the <see cref="emitBuffer"/>.
+        /// </summary>
+        private int accumulatedBits;
+
+        /// <summary>
+        /// Number of jagged bits stored in <see cref="accumulatedBits"/>
+        /// </summary>
+        private int bitCount;
+
+        private Block8x8F temporalBlock1;
+        private Block8x8F temporalBlock2;
+
+        /// <summary>
+        /// The output stream. All attempted writes after the first error become no-ops.
+        /// </summary>
+        private readonly Stream target;
+
+        public HuffmanScanEncoder(Stream outputStream)
+        {
+            this.target = outputStream;
+        }
+
+        /// <summary>
+        /// Encodes the image with no subsampling.
+        /// </summary>
+        /// <typeparam name="TPixel">The pixel format.</typeparam>
+        /// <param name="pixels">The pixel accessor providing access to the image pixels.</param>
+        /// <param name="luminanceQuantTable">Luminance quantization table provided by the caller</param>
+        /// <param name="chrominanceQuantTable">Chrominance quantization table provided by the caller</param>
+        /// <param name="cancellationToken">The token to monitor for cancellation.</param>
+        public void Encode444<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken)
+            where TPixel : unmanaged, IPixel<TPixel>
+        {
+            var unzig = ZigZag.CreateUnzigTable();
+
+            // ReSharper disable once InconsistentNaming
+            int prevDCY = 0, prevDCCb = 0, prevDCCr = 0;
+
+            ImageFrame<TPixel> frame = pixels.Frames.RootFrame;
+            Buffer2D<TPixel> pixelBuffer = frame.PixelBuffer;
+            RowOctet<TPixel> currentRows = default;
+
+            var pixelConverter = new YCbCrForwardConverter444<TPixel>(frame);
+
+            for (int y = 0; y < pixels.Height; y += 8)
+            {
+                cancellationToken.ThrowIfCancellationRequested();
+                currentRows.Update(pixelBuffer, y);
+
+                for (int x = 0; x < pixels.Width; x += 8)
+                {
+                    pixelConverter.Convert(x, y, ref currentRows);
+
+                    prevDCY = this.WriteBlock(
+                        QuantIndex.Luminance,
+                        prevDCY,
+                        ref pixelConverter.Y,
+                        ref luminanceQuantTable,
+                        ref unzig);
+
+                    prevDCCb = this.WriteBlock(
+                        QuantIndex.Chrominance,
+                        prevDCCb,
+                        ref pixelConverter.Cb,
+                        ref chrominanceQuantTable,
+                        ref unzig);
+
+                    prevDCCr = this.WriteBlock(
+                        QuantIndex.Chrominance,
+                        prevDCCr,
+                        ref pixelConverter.Cr,
+                        ref chrominanceQuantTable,
+                        ref unzig);
+                }
+            }
+
+            this.FlushInternalBuffer();
+        }
+
+        /// <summary>
+        /// Encodes the image with subsampling. The Cb and Cr components are each subsampled
+        /// at a factor of 2 both horizontally and vertically.
+        /// </summary>
+        /// <typeparam name="TPixel">The pixel format.</typeparam>
+        /// <param name="pixels">The pixel accessor providing access to the image pixels.</param>
+        /// <param name="luminanceQuantTable">Luminance quantization table provided by the caller</param>
+        /// <param name="chrominanceQuantTable">Chrominance quantization table provided by the caller</param>
+        /// <param name="cancellationToken">The token to monitor for cancellation.</param>
+        public void Encode420<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken)
+            where TPixel : unmanaged, IPixel<TPixel>
+        {
+            var unzig = ZigZag.CreateUnzigTable();
+
+            // ReSharper disable once InconsistentNaming
+            int prevDCY = 0, prevDCCb = 0, prevDCCr = 0;
+            ImageFrame<TPixel> frame = pixels.Frames.RootFrame;
+            Buffer2D<TPixel> pixelBuffer = frame.PixelBuffer;
+            RowOctet<TPixel> currentRows = default;
+
+            var pixelConverter = new YCbCrForwardConverter420<TPixel>(frame);
+
+            for (int y = 0; y < pixels.Height; y += 16)
+            {
+                cancellationToken.ThrowIfCancellationRequested();
+                for (int x = 0; x < pixels.Width; x += 16)
+                {
+                    for (int i = 0; i < 2; i++)
+                    {
+                        int yOff = i * 8;
+                        currentRows.Update(pixelBuffer, y + yOff);
+                        pixelConverter.Convert(x, y, ref currentRows, i);
+
+                        prevDCY = this.WriteBlock(
+                            QuantIndex.Luminance,
+                            prevDCY,
+                            ref pixelConverter.YLeft,
+                            ref luminanceQuantTable,
+                            ref unzig);
+
+                        prevDCY = this.WriteBlock(
+                            QuantIndex.Luminance,
+                            prevDCY,
+                            ref pixelConverter.YRight,
+                            ref luminanceQuantTable,
+                            ref unzig);
+                    }
+
+                    prevDCCb = this.WriteBlock(
+                        QuantIndex.Chrominance,
+                        prevDCCb,
+                        ref pixelConverter.Cb,
+                        ref chrominanceQuantTable,
+                        ref unzig);
+
+                    prevDCCr = this.WriteBlock(
+                        QuantIndex.Chrominance,
+                        prevDCCr,
+                        ref pixelConverter.Cr,
+                        ref chrominanceQuantTable,
+                        ref unzig);
+                }
+            }
+
+            this.FlushInternalBuffer();
+        }
+
+        /// <summary>
+        /// Encodes the image with no chroma, just luminance.
+        /// </summary>
+        /// <typeparam name="TPixel">The pixel format.</typeparam>
+        /// <param name="pixels">The pixel accessor providing access to the image pixels.</param>
+        /// <param name="luminanceQuantTable">Luminance quantization table provided by the caller</param>
+        /// <param name="cancellationToken">The token to monitor for cancellation.</param>
+        public void EncodeGrayscale<TPixel>(Image<TPixel> pixels, ref Block8x8F luminanceQuantTable, CancellationToken cancellationToken)
+            where TPixel : unmanaged, IPixel<TPixel>
+        {
+            var unzig = ZigZag.CreateUnzigTable();
+
+            // ReSharper disable once InconsistentNaming
+            int prevDCY = 0;
+
+            var pixelConverter = LuminanceForwardConverter<TPixel>.Create();
+            ImageFrame<TPixel> frame = pixels.Frames.RootFrame;
+            Buffer2D<TPixel> pixelBuffer = frame.PixelBuffer;
+            RowOctet<TPixel> currentRows = default;
+
+            for (int y = 0; y < pixels.Height; y += 8)
+            {
+                cancellationToken.ThrowIfCancellationRequested();
+                currentRows.Update(pixelBuffer, y);
+
+                for (int x = 0; x < pixels.Width; x += 8)
+                {
+                    pixelConverter.Convert(frame, x, y, ref currentRows);
+
+                    prevDCY = this.WriteBlock(
+                        QuantIndex.Luminance,
+                        prevDCY,
+                        ref pixelConverter.Y,
+                        ref luminanceQuantTable,
+                        ref unzig);
+                }
+            }
+
+            this.FlushInternalBuffer();
+        }
+
+        /// <summary>
+        /// Applies the forward DCT to a block, quantizes it with the given quantization table and
+        /// emits the Huffman-coded DC delta followed by the run-length coded AC values.
+        /// The source block is in natural (not zig-zag) order.
+        /// </summary>
+        /// <param name="index">The quantization table index; also selects the DC (2 * index) and AC (2 * index + 1) Huffman tables.</param>
+        /// <param name="prevDC">The DC value returned for the previous block; only the difference to it is emitted.</param>
+        /// <param name="src">Source block.</param>
+        /// <param name="quant">Quantization table.</param>
+        /// <param name="unZig">The 8x8 Unzig block.</param>
+        /// <returns>The quantized DC value of this block, to be passed as prevDC for the next block.</returns>
+        private int WriteBlock(
+            QuantIndex index,
+            int prevDC,
+            ref Block8x8F src,
+            ref Block8x8F quant,
+            ref ZigZag unZig)
+        {
+            ref Block8x8F refTemp1 = ref this.temporalBlock1;
+            ref Block8x8F refTemp2 = ref this.temporalBlock2;
+
+            FastFloatingPointDCT.TransformFDCT(ref src, ref refTemp1, ref refTemp2);
+
+            Block8x8F.Quantize(ref refTemp1, ref refTemp2, ref quant, ref unZig);
+
+            int dc = (int)refTemp2[0];
+
+            // Emit the DC delta relative to prevDC.
+            this.EmitHuffRLE((2 * (int)index) + 0, 0, dc - prevDC);
+
+            // Emit the AC components as (preceding zero run length, value) pairs.
+            int h = (2 * (int)index) + 1;
+            int runLength = 0;
+
+            for (int zig = 1; zig < Block8x8F.Size; zig++)
+            {
+                int ac = (int)refTemp2[zig];
+
+                if (ac == 0)
+                {
+                    runLength++;
+                }
+                else
+                {
+                    while (runLength > 15)
+                    {
+                        this.EmitHuff(h, 0xf0);
+                        runLength -= 16;
+                    }
+
+                    this.EmitHuffRLE(h, runLength, ac);
+                    runLength = 0;
+                }
+            }
+
+            if (runLength > 0)
+            {
+                this.EmitHuff(h, 0x00);
+            }
+
+            return dc;
+        }
+
+        /// <summary>
+        /// Emits the least significant <paramref name="count"/> bits of <paramref name="bits"/> to the stream write buffer.
+        /// Preconditions (not checked here):
+        /// <example>
+        /// bits &lt; (1 &lt;&lt; count) &amp;&amp; count &lt;= 16
+        /// </example>
+        /// Complete bytes are buffered; any remaining bits stay in the accumulator for the next call.
+        /// </summary>
+        /// <param name="bits">The packed bits.</param>
+        /// <param name="count">The number of bits</param>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        private void Emit(int bits, int count)
+        {
+            count += this.bitCount;
+            bits <<= 32 - count;
+            bits |= this.accumulatedBits;
+
+            // Only write once at least 8 bits have been accumulated.
+            if (count >= 8)
+            {
+                // Move complete bytes from the top of the accumulator into the emit buffer
+                while (count >= 8)
+                {
+                    byte b = (byte)(bits >> 24);
+                    this.emitBuffer[this.emitLen++] = b;
+                    if (b == byte.MaxValue)
+                    {
+                        this.emitBuffer[this.emitLen++] = byte.MinValue;
+                    }
+
+                    bits <<= 8;
+                    count -= 8;
+                }
+
+                // The loop above can run up to 4 times, each time writing:
+                // 1 byte guaranteed
+                // 1 extra byte.MinValue byte if previous one was byte.MaxValue
+                // Thus writing (1 + 1) * 4 = 8 bytes max
+                // So the emit buffer must always keep 8 spare bytes; if it does not - call stream.Write
+                if (this.emitLen > EmitBufferSizeInBytes - 8)
+                {
+                    this.target.Write(this.emitBuffer, 0, this.emitLen);
+                    this.emitLen = 0;
+                }
+            }
+
+            this.accumulatedBits = bits;
+            this.bitCount = count;
+        }
+
+        /// <summary>
+        /// Looks up the packed Huffman entry for a value (length in the top 8 bits, code in the low 24) and emits it.
+        /// </summary>
+        /// <param name="index">The index of the Huffman table to use.</param>
+        /// <param name="value">The value whose code is emitted.</param>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        private void EmitHuff(int index, int value)
+        {
+            int packed = HuffmanLut.TheHuffmanLut[index].Values[value];
+            this.Emit(packed & 0x00FFFFFF, packed >> 24);
+        }
+
+        /// <summary>
+        /// Emits a (run length, value) pair: the Huffman-coded run/size symbol followed by the raw bits of the value.
+        /// </summary>
+        /// <param name="index">The index of the Huffman encoder</param>
+        /// <param name="runLength">The run length stored in the upper nibble of the emitted symbol.</param>
+        /// <param name="value">The value to encode; negative values are emitted as value - 1.</param>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        private void EmitHuffRLE(int index, int runLength, int value)
+        {
+            bool isNegative = value < 0;
+            int magnitude = isNegative ? -value : value;
+            int payload = isNegative ? value - 1 : value;
+
+            int bitLength = Numerics.MinimumBitsToStore16((uint)magnitude);
+            int symbol = (runLength << 4) | bitLength;
+
+            this.EmitHuff(index, symbol);
+            if (bitLength <= 0)
+            {
+                return;
+            }
+
+            int payloadMask = (1 << bitLength) - 1;
+            this.Emit(payload & payloadMask, bitLength);
+        }
+
+        /// <summary>
+        /// Writes remaining bytes from internal buffer to the target stream.
+        /// </summary>
+        /// <remarks>Pads last byte with 1's if necessary</remarks>
+        private void FlushInternalBuffer()
+        {
+            // pad the last partial byte with 1's; when no bits are pending the result is 0 and nothing is padded
+            int padBitsCount = (8 - (this.bitCount % 8)) % 8;
+            if (padBitsCount != 0)
+            {
+                this.Emit((1 << padBitsCount) - 1, padBitsCount);
+            }
+
+            // flush remaining bytes
+            if (this.emitLen != 0)
+            {
+                this.target.Write(this.emitBuffer, 0, this.emitLen);
+            }
+        }
+    }
+}
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/LuminanceForwardConverter{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/LuminanceForwardConverter{TPixel}.cs
index cc81130dd7..fc5b9a8682 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/LuminanceForwardConverter{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/LuminanceForwardConverter{TPixel}.cs
@@ -49,7 +49,7 @@ public void Convert(ImageFrame<TPixel> frame, int x, int y, ref RowOctet<TPixel>
             ref Block8x8F yBlock = ref this.Y;
             ref L8 l8Start = ref l8Span[0];
 
-            for (int i = 0; i < 64; i++)
+            for (int i = 0; i < Block8x8F.Size; i++)
             {
                 ref L8 c = ref Unsafe.Add(ref l8Start, i);
                 yBlock[i] = c.PackedValue;
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
index 3c1a02c5aa..15574a32a2 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs
@@ -92,48 +92,144 @@ public static RgbToYCbCrConverterLut Create()
             return tables;
         }
 
-        /// <summary>
-        /// Optimized method to allocates the correct y, cb, and cr values to the DCT blocks from the given r, g, b values.
-        /// </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private void ConvertPixelInto(
-            int r,
-            int g,
-            int b,
-            ref Block8x8F yResult,
-            ref Block8x8F cbResult,
-            ref Block8x8F crResult,
-            int i)
+        private float CalculateY(byte r, byte g, byte b)
         {
             // float y = (0.299F * r) + (0.587F * g) + (0.114F * b);
-            yResult[i] = (this.YRTable[r] + this.YGTable[g] + this.YBTable[b]) >> ScaleBits;
+            return (this.YRTable[r] + this.YGTable[g] + this.YBTable[b]) >> ScaleBits;
+        }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private float CalculateCb(byte r, byte g, byte b)
+        {
             // float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b));
-            cbResult[i] = (this.CbRTable[r] + this.CbGTable[g] + this.CbBTable[b]) >> ScaleBits;
+            return (this.CbRTable[r] + this.CbGTable[g] + this.CbBTable[b]) >> ScaleBits;
+        }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private float CalculateCr(byte r, byte g, byte b)
+        {
             // float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b));
-            crResult[i] = (this.CbBTable[r] + this.CrGTable[g] + this.CrBTable[b]) >> ScaleBits;
+            return (this.CbBTable[r] + this.CrGTable[g] + this.CrBTable[b]) >> ScaleBits; // NOTE(review): the r term reads CbBTable - presumably shared because both coefficients are 0.5F; confirm against Create()
         }
 
-        public void Convert(Span<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
+        /// <summary>
+        /// Converts Rgb24 pixels into YCbCr color space with 4:4:4 sampling of luminance and chroma (no chroma subsampling).
+        /// </summary>
+        /// <param name="rgbSpan">Span of Rgb24 pixel data</param>
+        /// <param name="yBlock">Resulting Y values block</param>
+        /// <param name="cbBlock">Resulting Cb values block</param>
+        /// <param name="crBlock">Resulting Cr values block</param>
+        public void Convert444(Span<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
         {
             ref Rgb24 rgbStart = ref rgbSpan[0];
 
-            for (int i = 0; i < 64; i++)
+            for (int i = 0; i < Block8x8F.Size; i++)
             {
-                ref Rgb24 c = ref Unsafe.Add(ref rgbStart, i);
-
-                this.ConvertPixelInto(
-                    c.R,
-                    c.G,
-                    c.B,
-                    ref yBlock,
-                    ref cbBlock,
-                    ref crBlock,
-                    i);
+                Rgb24 c = Unsafe.Add(ref rgbStart, i);
+
+                yBlock[i] = this.CalculateY(c.R, c.G, c.B);
+                cbBlock[i] = this.CalculateCb(c.R, c.G, c.B);
+                crBlock[i] = this.CalculateCr(c.R, c.G, c.B);
             }
         }
 
+        /// <summary>
+        /// Converts Rgb24 pixels into YCbCr color space with 4:2:0 chroma subsampling; luminance is kept at full resolution.
+        /// </summary>
+        /// <remarks>Calculates 2 out of 4 luminance blocks and half of each chroma block. This method must be called twice per group of four 8x8 luminance blocks, once for each row value.</remarks>
+        /// <param name="rgbSpan">Span of Rgb24 pixel data, read as 8 rows of 16 pixels</param>
+        /// <param name="yBlockLeft">First or "left" resulting Y block</param>
+        /// <param name="yBlockRight">Second or "right" resulting Y block</param>
+        /// <param name="cbBlock">Resulting Cb values block; only the half selected by row is written</param>
+        /// <param name="crBlock">Resulting Cr values block; only the half selected by row is written</param>
+        /// <param name="row">Index of the 16x8 half of the 16x16 block: 0 (upper) or 1 (lower)</param>
+        public void Convert420(Span<Rgb24> rgbSpan, ref Block8x8F yBlockLeft, ref Block8x8F yBlockRight, ref Block8x8F cbBlock, ref Block8x8F crBlock, int row)
+        {
+            DebugGuard.MustBeBetweenOrEqualTo(row, 0, 1, nameof(row));
+
+            ref float yBlockLeftRef = ref Unsafe.As<Block8x8F, float>(ref yBlockLeft);
+            ref float yBlockRightRef = ref Unsafe.As<Block8x8F, float>(ref yBlockRight);
+
+            // chroma values are written to elements 0-31 or 32-63,
+            // i.e. the upper or lower part of the chroma blocks
+            int chromaWriteOffset = row * (Block8x8F.Size / 2);
+            ref float cbBlockRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, float>(ref cbBlock), chromaWriteOffset);
+            ref float crBlockRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, float>(ref crBlock), chromaWriteOffset);
+
+            ref Rgb24 rgbStart = ref rgbSpan[0];
+
+            for (int i = 0; i < 8; i += 2)
+            {
+                int yBlockWriteOffset = i * 8;
+                ref Rgb24 stride = ref Unsafe.Add(ref rgbStart, i * 16);
+
+                int chromaOffset = 8 * (i / 2);
+
+                // left 8 pixels of rows i and i + 1 -> chroma columns 0-3
+                this.ConvertChunk420(
+                    ref stride,
+                    ref Unsafe.Add(ref yBlockLeftRef, yBlockWriteOffset),
+                    ref Unsafe.Add(ref cbBlockRef, chromaOffset),
+                    ref Unsafe.Add(ref crBlockRef, chromaOffset));
+
+                // right 8 pixels of rows i and i + 1 -> chroma columns 4-7
+                this.ConvertChunk420(
+                    ref Unsafe.Add(ref stride, 8),
+                    ref Unsafe.Add(ref yBlockRightRef, yBlockWriteOffset),
+                    ref Unsafe.Add(ref cbBlockRef, chromaOffset + 4),
+                    ref Unsafe.Add(ref crBlockRef, chromaOffset + 4));
+            }
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void ConvertChunk420(ref Rgb24 stride, ref float yBlock, ref float cbBlock, ref float crBlock)
+        {
+            // converts an 8x2 pixel chunk: 16 luminance values (two rows of 8) and 4 averaged Cb/Cr values
+            // the rgb rows are 16 pixels wide, so the pixel directly below is at +16 from the stride reference
+            // luminance (Y) is kept at original resolution; a Y block row holds 8 values, thus the +8 offset
+            for (int k = 0; k < 8; k += 2)
+            {
+                ref float yBlockRef = ref Unsafe.Add(ref yBlock, k);
+
+                // top row
+                Rgb24 px0 = Unsafe.Add(ref stride, k);
+                Rgb24 px1 = Unsafe.Add(ref stride, k + 1);
+                yBlockRef = this.CalculateY(px0.R, px0.G, px0.B);
+                Unsafe.Add(ref yBlockRef, 1) = this.CalculateY(px1.R, px1.G, px1.B);
+
+                // bottom row
+                Rgb24 px2 = Unsafe.Add(ref stride, k + 16);
+                Rgb24 px3 = Unsafe.Add(ref stride, k + 17);
+                Unsafe.Add(ref yBlockRef, 8) = this.CalculateY(px2.R, px2.G, px2.B);
+                Unsafe.Add(ref yBlockRef, 9) = this.CalculateY(px3.R, px3.G, px3.B);
+
+                // one chroma value per 2x2 pixel block: the average of its four pixels
+                Unsafe.Add(ref cbBlock, k / 2) = this.CalculateAverageCb(px0, px1, px2, px3);
+                Unsafe.Add(ref crBlock, k / 2) = this.CalculateAverageCr(px0, px1, px2, px3);
+            }
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private float CalculateAverageCb(Rgb24 px0, Rgb24 px1, Rgb24 px2, Rgb24 px3)
+        {
+            float cb0 = this.CalculateCb(px0.R, px0.G, px0.B);
+            float cb1 = this.CalculateCb(px1.R, px1.G, px1.B);
+            float cb2 = this.CalculateCb(px2.R, px2.G, px2.B);
+            float cb3 = this.CalculateCb(px3.R, px3.G, px3.B);
+            return 0.25f * (cb0 + cb1 + cb2 + cb3);
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private float CalculateAverageCr(Rgb24 px0, Rgb24 px1, Rgb24 px2, Rgb24 px3)
+        {
+            float cr0 = this.CalculateCr(px0.R, px0.G, px0.B);
+            float cr1 = this.CalculateCr(px1.R, px1.G, px1.B);
+            float cr2 = this.CalculateCr(px2.R, px2.G, px2.B);
+            float cr3 = this.CalculateCr(px3.R, px3.G, px3.B);
+            return 0.25f * (cr0 + cr1 + cr2 + cr3);
+        }
+
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static int Fix(float x)
             => (int)((x * (1L << ScaleBits)) + 0.5F);
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
index 209cc3c6ab..926e7d5a4a 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs
@@ -1,4 +1,4 @@
-﻿// Copyright (c) Six Labors.
+// Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.
 
 using System;
@@ -27,19 +27,45 @@ public static bool IsSupported
             }
         }
 
+        public static int AvxCompatibilityPadding
+        {
+            // Number of extra bytes that must follow the rgb byte span when the AVX2 converters are used (0 otherwise).
+            // An 8x8 rgb byte matrix holds 8 strides of 8 pixels each, stored sequentially: 64 * 3 = 192 bytes
+            // Each stride has exactly 3 * 8 = 24 bytes or 3 * 8 * 8 = 192 bits
+            // Avx registers are 256 bits so every stride load also reads 64 bits that belong to the next stride:
+            // stride 0    0    - 192  -(+64bits)-> 256
+            // stride 1    192  - 384  -(+64bits)-> 448
+            // stride 2    384  - 576  -(+64bits)-> 640
+            // stride 3    576  - 768  -(+64bits)-> 832
+            // stride 4    768  - 960  -(+64bits)-> 1024
+            // stride 5    960  - 1152 -(+64bits)-> 1216
+            // stride 6    1152 - 1344 -(+64bits)-> 1408
+            // stride 7    1344 - 1536 -(+64bits)-> 1600 <-- READ ACCESS VIOLATION
+            //
+            // Total size of the 64 pixel rgb span: 64 * 3 * 8 = 1536 bits, avx operations require 1600 bits
+            // This is not permitted - the last load would read memory outside of the span
+            //
+            // An 8 byte padding after the rgb byte span solves this problem without extra code in the converters
+            get
+            {
+#if SUPPORTS_RUNTIME_INTRINSICS
+                if (IsSupported)
+                {
+                    return 8;
+                }
+#endif
+                return 0;
+            }
+        }
+
 #if SUPPORTS_RUNTIME_INTRINSICS
+
         private static ReadOnlySpan<byte> MoveFirst24BytesToSeparateLanes => new byte[]
         {
             0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0,
             3, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0
         };
 
-        private static ReadOnlySpan<byte> MoveLast24BytesToSeparateLanes => new byte[]
-        {
-            2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0,
-            5, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, 1, 0, 0, 0
-        };
-
         private static ReadOnlySpan<byte> ExtractRgb => new byte[]
         {
             0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 0xFF, 0xFF, 0xFF, 0xFF,
@@ -47,7 +73,15 @@ public static bool IsSupported
         };
 #endif
 
-        public static void Convert(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
+        /// <summary>
+        /// Converts 8x8 Rgb24 pixel matrix to YCbCr pixel matrices with 4:4:4 subsampling
+        /// </summary>
+        /// <remarks>Total size of rgb span must be 200 bytes</remarks>
+        /// <param name="rgbSpan">Span of rgb pixels with size of 64</param>
+        /// <param name="yBlock">8x8 destination matrix of Luminance(Y) converted data</param>
+        /// <param name="cbBlock">8x8 destination matrix of Chrominance(Cb) converted data</param>
+        /// <param name="crBlock">8x8 destination matrix of Chrominance(Cr) converted data</param>
+        public static void Convert444(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock)
         {
             Debug.Assert(IsSupported, "AVX2 is required to run this converter");
 
@@ -63,18 +97,20 @@ public static void Convert(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlock, re
             var f05 = Vector256.Create(0.5f);
             var zero = Vector256.Create(0).AsByte();
 
-            ref Vector256<byte> inRef = ref Unsafe.As<Rgb24, Vector256<byte>>(ref MemoryMarshal.GetReference(rgbSpan));
-            ref Vector256<float> destYRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref yBlock);
-            ref Vector256<float> destCbRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref cbBlock);
-            ref Vector256<float> destCrRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref crBlock);
+            ref Vector256<byte> rgbByteSpan = ref Unsafe.As<Rgb24, Vector256<byte>>(ref MemoryMarshal.GetReference(rgbSpan));
+            ref Vector256<float> destYRef = ref yBlock.V0;
+            ref Vector256<float> destCbRef = ref cbBlock.V0;
+            ref Vector256<float> destCrRef = ref crBlock.V0;
 
             var extractToLanesMask = Unsafe.As<byte, Vector256<uint>>(ref MemoryMarshal.GetReference(MoveFirst24BytesToSeparateLanes));
             var extractRgbMask = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(ExtractRgb));
             Vector256<byte> rgb, rg, bx;
             Vector256<float> r, g, b;
-            for (int i = 0; i < 7; i++)
+
+            const int bytesPerRgbStride = 24;
+            for (int i = 0; i < 8; i++)
             {
-                rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref inRef, (IntPtr)(24 * i)).AsUInt32(), extractToLanesMask).AsByte();
+                rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref rgbByteSpan, (IntPtr)(bytesPerRgbStride * i)).AsUInt32(), extractToLanesMask).AsByte();
 
                 rgb = Avx2.Shuffle(rgb, extractRgbMask);
 
@@ -94,27 +130,130 @@ public static void Convert(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlock, re
                 // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
                 Unsafe.Add(ref destCrRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
             }
+#endif
+        }
+
+        /// <summary>
+        /// Converts a 16x8 Rgb24 pixel matrix to two 8x8 Y matrices and to the half (4 rows, selected by row) of the 8x8 Cb/Cr matrices, using 4:2:0 subsampling
+        /// </summary>
+        public static void Convert420(ReadOnlySpan<Rgb24> rgbSpan, ref Block8x8F yBlockLeft, ref Block8x8F yBlockRight, ref Block8x8F cbBlock, ref Block8x8F crBlock, int row)
+        {
+            Debug.Assert(IsSupported, "AVX2 is required to run this converter");
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+            var f0299 = Vector256.Create(0.299f);
+            var f0587 = Vector256.Create(0.587f);
+            var f0114 = Vector256.Create(0.114f);
+            var fn0168736 = Vector256.Create(-0.168736f);
+            var fn0331264 = Vector256.Create(-0.331264f);
+            var f128 = Vector256.Create(128f);
+            var fn0418688 = Vector256.Create(-0.418688f);
+            var fn0081312F = Vector256.Create(-0.081312F);
+            var f05 = Vector256.Create(0.5f);
+            var zero = Vector256.Create(0).AsByte();
+
+            ref Vector256<byte> rgbByteSpan = ref Unsafe.As<Rgb24, Vector256<byte>>(ref MemoryMarshal.GetReference(rgbSpan));
+
+            int destOffset = row * 4;
 
-            extractToLanesMask = Unsafe.As<byte, Vector256<uint>>(ref MemoryMarshal.GetReference(MoveLast24BytesToSeparateLanes));
-            rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref inRef, (IntPtr)160).AsUInt32(), extractToLanesMask).AsByte();
-            rgb = Avx2.Shuffle(rgb, extractRgbMask);
+            ref Vector256<float> destCbRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector256<float>>(ref cbBlock), destOffset);
+            ref Vector256<float> destCrRef = ref Unsafe.Add(ref Unsafe.As<Block8x8F, Vector256<float>>(ref crBlock), destOffset);
 
-            rg = Avx2.UnpackLow(rgb, zero);
-            bx = Avx2.UnpackHigh(rgb, zero);
+            var extractToLanesMask = Unsafe.As<byte, Vector256<uint>>(ref MemoryMarshal.GetReference(MoveFirst24BytesToSeparateLanes));
+            var extractRgbMask = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(ExtractRgb));
+            Vector256<byte> rgb, rg, bx;
+            Vector256<float> r, g, b;
+
+            Span<Vector256<float>> rDataLanes = stackalloc Vector256<float>[4];
+            Span<Vector256<float>> gDataLanes = stackalloc Vector256<float>[4];
+            Span<Vector256<float>> bDataLanes = stackalloc Vector256<float>[4];
+
+            const int bytesPerRgbStride = 24;
+            for (int i = 0; i < 4; i++)
+            {
+                // 16x2 => 8x1
+                // left 8x8 column conversions
+                for (int j = 0; j < 4; j += 2)
+                {
+                    rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref rgbByteSpan, (IntPtr)(bytesPerRgbStride * (i * 4 + j))).AsUInt32(), extractToLanesMask).AsByte();
+
+                    rgb = Avx2.Shuffle(rgb, extractRgbMask);
+
+                    rg = Avx2.UnpackLow(rgb, zero);
+                    bx = Avx2.UnpackHigh(rgb, zero);
+
+                    r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32());
+                    g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32());
+                    b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32());
+
+                    int yBlockVerticalOffset = (i * 2) + ((j & 2) >> 1);
+
+                    // (0.299F * r) + (0.587F * g) + (0.114F * b);
+                    Unsafe.Add(ref yBlockLeft.V0, yBlockVerticalOffset) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r);
+
+                    rDataLanes[j] = r;
+                    gDataLanes[j] = g;
+                    bDataLanes[j] = b;
+                }
+
+                // 16x2 => 8x1
+                // right 8x8 column conversions
+                for (int j = 1; j < 4; j += 2)
+                {
+                    rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref rgbByteSpan, (IntPtr)(bytesPerRgbStride * (i * 4 + j))).AsUInt32(), extractToLanesMask).AsByte();
+
+                    rgb = Avx2.Shuffle(rgb, extractRgbMask);
+
+                    rg = Avx2.UnpackLow(rgb, zero);
+                    bx = Avx2.UnpackHigh(rgb, zero);
 
-            r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32());
-            g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32());
-            b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32());
+                    r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32());
+                    g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32());
+                    b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32());
 
-            // (0.299F * r) + (0.587F * g) + (0.114F * b);
-            Unsafe.Add(ref destYRef, 7) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r);
+                    int yBlockVerticalOffset = (i * 2) + ((j & 2) >> 1);
 
-            // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
-            Unsafe.Add(ref destCbRef, 7) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r));
+                    // (0.299F * r) + (0.587F * g) + (0.114F * b);
+                    Unsafe.Add(ref yBlockRight.V0, yBlockVerticalOffset) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r);
 
-            // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
-            Unsafe.Add(ref destCrRef, 7) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
+                    rDataLanes[j] = r;
+                    gDataLanes[j] = g;
+                    bDataLanes[j] = b;
+                }
+
+                r = Scale16x2_8x1(rDataLanes);
+                g = Scale16x2_8x1(gDataLanes);
+                b = Scale16x2_8x1(bDataLanes);
+
+                // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b))
+                Unsafe.Add(ref destCbRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r));
+
+                // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b))
+                Unsafe.Add(ref destCrRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r));
+            }
 #endif
         }
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+        /// <summary>
+        /// Scales 16x2 matrix to 8x1 using 2x2 average
+        /// </summary>
+        /// <param name="v">Input matrix consisting of 4 256bit vectors: v[0] and v[2] are summed as the left half, v[1] and v[3] as the right half</param>
+        /// <returns>256bit vector whose lower four lanes hold the averaged left half and whose upper four lanes hold the averaged right half</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static Vector256<float> Scale16x2_8x1(ReadOnlySpan<Vector256<float>> v)
+        {
+            Debug.Assert(Avx2.IsSupported, "AVX2 is required to run this converter");
+            DebugGuard.IsTrue(v.Length == 4, "Input span must consist of 4 elements");
+
+            var f025 = Vector256.Create(0.25f);
+
+            Vector256<float> left = Avx.Add(v[0], v[2]);
+            Vector256<float> right = Avx.Add(v[1], v[3]);
+            Vector256<float> avg2x2 = Avx.Multiply(Avx.HorizontalAdd(left, right), f025);
+
+            return Avx2.Permute4x64(avg2x2.AsDouble(), 0b11_01_10_00).AsSingle();
+        }
+#endif
     }
 }
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs
new file mode 100644
index 0000000000..a4abd532b3
--- /dev/null
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs
@@ -0,0 +1,121 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Runtime.InteropServices;
+using SixLabors.ImageSharp.Advanced;
+using SixLabors.ImageSharp.PixelFormats;
+
+namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
+{
+    /// <summary>
+    /// On-stack worker struct to efficiently encapsulate the TPixel -> Rgb24 -> YCbCr conversion chain with 4:2:0 subsampling, one 16x8 pixel sample per call.
+    /// </summary>
+    /// <typeparam name="TPixel">The pixel type to work on</typeparam>
+    internal ref struct YCbCrForwardConverter420<TPixel>
+        where TPixel : unmanaged, IPixel<TPixel>
+    {
+        /// <summary>
+        /// Number of pixels processed per single <see cref="Convert(int, int, ref RowOctet{TPixel}, int)"/> call
+        /// </summary>
+        private const int PixelsPerSample = 16 * 8;
+
+        /// <summary>
+        /// Total byte size of processed pixels converted from TPixel to <see cref="Rgb24"/>
+        /// </summary>
+        private const int RgbSpanByteSize = PixelsPerSample * 3;
+
+        /// <summary>
+        /// <see cref="Size"/> of sampling area from given frame pixel buffer
+        /// </summary>
+        private static readonly Size SampleSize = new Size(16, 8);
+
+        /// <summary>
+        /// The left Y component
+        /// </summary>
+        public Block8x8F YLeft;
+
+        /// <summary>
+        /// The right Y component
+        /// </summary>
+        public Block8x8F YRight;
+
+        /// <summary>
+        /// The Cb component
+        /// </summary>
+        public Block8x8F Cb;
+
+        /// <summary>
+        /// The Cr component
+        /// </summary>
+        public Block8x8F Cr;
+
+        /// <summary>
+        /// The color conversion tables; only created when the vectorized converter is not supported
+        /// </summary>
+        private RgbToYCbCrConverterLut colorTables;
+
+        /// <summary>
+        /// Temporary 16x8 block to hold TPixel data
+        /// </summary>
+        private Span<TPixel> pixelSpan;
+
+        /// <summary>
+        /// Temporary Rgb24 block, allocated with the extra padding bytes required by the vectorized converter
+        /// </summary>
+        private Span<Rgb24> rgbSpan;
+
+        /// <summary>
+        /// Size of the source frame the samples are taken from
+        /// </summary>
+        private Size samplingAreaSize;
+
+        /// <summary>
+        /// <see cref="Configuration"/> for internal operations
+        /// </summary>
+        private Configuration config;
+
+        public YCbCrForwardConverter420(ImageFrame<TPixel> frame)
+        {
+            // matrices are filled during Convert calls
+            this.YLeft = default;
+            this.YRight = default;
+            this.Cb = default;
+            this.Cr = default;
+
+            // temporary pixel buffers
+            this.pixelSpan = new TPixel[PixelsPerSample].AsSpan();
+            this.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[RgbSpanByteSize + RgbToYCbCrConverterVectorized.AvxCompatibilityPadding].AsSpan());
+
+            // frame data
+            this.samplingAreaSize = new Size(frame.Width, frame.Height);
+            this.config = frame.GetConfiguration();
+
+            // lookup tables are only needed by the non-vectorized fallback path
+            if (!RgbToYCbCrConverterVectorized.IsSupported)
+            {
+                this.colorTables = RgbToYCbCrConverterLut.Create();
+            }
+            else
+            {
+                this.colorTables = default;
+            }
+        }
+
+        public void Convert(int x, int y, ref RowOctet<TPixel> currentRows, int idx)
+        {
+            YCbCrForwardConverter<TPixel>.LoadAndStretchEdges(currentRows, this.pixelSpan, new Point(x, y), SampleSize, this.samplingAreaSize);
+
+            PixelOperations<TPixel>.Instance.ToRgb24(this.config, this.pixelSpan, this.rgbSpan);
+
+            if (RgbToYCbCrConverterVectorized.IsSupported)
+            {
+                RgbToYCbCrConverterVectorized.Convert420(this.rgbSpan, ref this.YLeft, ref this.YRight, ref this.Cb, ref this.Cr, idx);
+            }
+            else
+            {
+                this.colorTables.Convert420(this.rgbSpan, ref this.YLeft, ref this.YRight, ref this.Cb, ref this.Cr, idx);
+            }
+        }
+    }
+}
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
new file mode 100644
index 0000000000..ef589272bd
--- /dev/null
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs
@@ -0,0 +1,122 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Runtime.InteropServices;
+using SixLabors.ImageSharp.Advanced;
+using SixLabors.ImageSharp.PixelFormats;
+
+namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
+{
+    /// <summary>
+    /// On-stack worker struct to efficiently encapsulate the TPixel -> Rgb24 -> YCbCr conversion chain of 8x8 pixel blocks.
+    /// </summary>
+    /// <typeparam name="TPixel">The pixel type to work on</typeparam>
+    internal ref struct YCbCrForwardConverter444<TPixel>
+        where TPixel : unmanaged, IPixel<TPixel>
+    {
+        /// <summary>
+        /// Number of pixels (one 8x8 sample) processed per single <see cref="Convert(int, int, ref RowOctet{TPixel})"/> call
+        /// </summary>
+        private const int PixelsPerSample = 8 * 8;
+
+        /// <summary>
+        /// Total byte size of one sample converted from TPixel to <see cref="Rgb24"/> (3 bytes per pixel)
+        /// </summary>
+        private const int RgbSpanByteSize = PixelsPerSample * 3;
+
+        /// <summary>
+        /// <see cref="Size"/> of the area sampled from the frame pixel buffer per call
+        /// </summary>
+        private static readonly Size SampleSize = new Size(8, 8);
+
+        /// <summary>
+        /// The Y component
+        /// </summary>
+        public Block8x8F Y;
+
+        /// <summary>
+        /// The Cb component
+        /// </summary>
+        public Block8x8F Cb;
+
+        /// <summary>
+        /// The Cr component
+        /// </summary>
+        public Block8x8F Cr;
+
+        /// <summary>
+        /// The color conversion lookup tables; only created when the vectorized converter is not supported
+        /// </summary>
+        private RgbToYCbCrConverterLut colorTables;
+
+        /// <summary>
+        /// Temporary span of 64 pixels (one 8x8 sample) holding the unconverted TPixel data
+        /// </summary>
+        private Span<TPixel> pixelSpan;
+
+        /// <summary>
+        /// Temporary span holding the sample converted to Rgb24; its backing array includes AvxCompatibilityPadding extra bytes
+        /// </summary>
+        private Span<Rgb24> rgbSpan;
+
+        /// <summary>
+        /// Size of the sampled pixel buffer (width and height of the frame)
+        /// </summary>
+        private Size samplingAreaSize;
+
+        /// <summary>
+        /// <see cref="Configuration"/> for internal operations
+        /// </summary>
+        private Configuration config;
+
+        public YCbCrForwardConverter444(ImageFrame<TPixel> frame)
+        {
+            // the component blocks are filled by Convert calls
+            this.Y = default;
+            this.Cb = default;
+            this.Cr = default;
+
+            // temporary pixel buffers, allocated once and reused by every Convert call
+            this.pixelSpan = new TPixel[PixelsPerSample].AsSpan();
+            this.rgbSpan = MemoryMarshal.Cast<byte, Rgb24>(new byte[RgbSpanByteSize + RgbToYCbCrConverterVectorized.AvxCompatibilityPadding].AsSpan());
+
+            // frame data
+            this.samplingAreaSize = new Size(frame.Width, frame.Height);
+            this.config = frame.GetConfiguration();
+
+            // lookup tables are only needed as a fallback when the vectorized converter is not supported
+            if (!RgbToYCbCrConverterVectorized.IsSupported)
+            {
+                this.colorTables = RgbToYCbCrConverterLut.Create();
+            }
+            else
+            {
+                this.colorTables = default;
+            }
+        }
+
+        /// <summary>
+        /// Converts an 8x8 image area of 'currentRows' at position (x,y), placing the result into the members of the structure (<see cref="Y"/>, <see cref="Cb"/>, <see cref="Cr"/>)
+        /// </summary>
+        public void Convert(int x, int y, ref RowOctet<TPixel> currentRows)
+        {
+            YCbCrForwardConverter<TPixel>.LoadAndStretchEdges(currentRows, this.pixelSpan, new Point(x, y), SampleSize, this.samplingAreaSize);
+
+            PixelOperations<TPixel>.Instance.ToRgb24(this.config, this.pixelSpan, this.rgbSpan);
+
+            ref Block8x8F yBlock = ref this.Y;
+            ref Block8x8F cbBlock = ref this.Cb;
+            ref Block8x8F crBlock = ref this.Cr;
+
+            if (RgbToYCbCrConverterVectorized.IsSupported)
+            {
+                RgbToYCbCrConverterVectorized.Convert444(this.rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
+            }
+            else
+            {
+                this.colorTables.Convert444(this.rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
+            }
+        }
+    }
+}
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
index 81e64b277b..f5ef770914 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs
@@ -2,81 +2,59 @@
 // Licensed under the Apache License, Version 2.0.
 
 using System;
-using SixLabors.ImageSharp.Advanced;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
 using SixLabors.ImageSharp.PixelFormats;
 
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder
 {
-    /// <summary>
-    /// On-stack worker struct to efficiently encapsulate the TPixel -> Rgb24 -> YCbCr conversion chain of 8x8 pixel blocks.
-    /// </summary>
-    /// <typeparam name="TPixel">The pixel type to work on</typeparam>
-    internal ref struct YCbCrForwardConverter<TPixel>
+    internal static class YCbCrForwardConverter<TPixel>
         where TPixel : unmanaged, IPixel<TPixel>
     {
-        /// <summary>
-        /// The Y component
-        /// </summary>
-        public Block8x8F Y;
-
-        /// <summary>
-        /// The Cb component
-        /// </summary>
-        public Block8x8F Cb;
-
-        /// <summary>
-        /// The Cr component
-        /// </summary>
-        public Block8x8F Cr;
+        public static void LoadAndStretchEdges(RowOctet<TPixel> source, Span<TPixel> dest, Point start, Size sampleSize, Size totalSize)
+        {
+            DebugGuard.MustBeBetweenOrEqualTo(start.X, 0, totalSize.Width - 1, nameof(start.X)); // 0 is a valid start: the first sample begins at the origin
+            DebugGuard.MustBeBetweenOrEqualTo(start.Y, 0, totalSize.Height - 1, nameof(start.Y));
 
-        /// <summary>
-        /// The color conversion tables
-        /// </summary>
-        private RgbToYCbCrConverterLut colorTables;
+            int width = Math.Min(sampleSize.Width, totalSize.Width - start.X); // source columns available to copy
+            int height = Math.Min(sampleSize.Height, totalSize.Height - start.Y); // source rows available to copy
 
-        /// <summary>
-        /// Temporal 8x8 block to hold TPixel data
-        /// </summary>
-        private GenericBlock8x8<TPixel> pixelBlock;
+            uint byteWidth = (uint)(width * Unsafe.SizeOf<TPixel>());
+            int remainderXCount = sampleSize.Width - width; // columns to pad on the right of each row
 
-        /// <summary>
-        /// Temporal RGB block
-        /// </summary>
-        private GenericBlock8x8<Rgb24> rgbBlock;
+            ref byte blockStart = ref MemoryMarshal.GetReference(MemoryMarshal.Cast<TPixel, byte>(dest));
+            int rowSizeInBytes = sampleSize.Width * Unsafe.SizeOf<TPixel>();
 
-        public static YCbCrForwardConverter<TPixel> Create()
-        {
-            var result = default(YCbCrForwardConverter<TPixel>);
-            if (!RgbToYCbCrConverterVectorized.IsSupported)
+            for (int y = 0; y < height; y++)
             {
-                // Avoid creating lookup tables, when vectorized converter is supported
-                result.colorTables = RgbToYCbCrConverterLut.Create();
-            }
+                Span<TPixel> row = source[y];
 
-            return result;
-        }
+                ref byte s = ref Unsafe.As<TPixel, byte>(ref row[start.X]);
+                ref byte d = ref Unsafe.Add(ref blockStart, y * rowSizeInBytes);
 
-        /// <summary>
-        /// Converts a 8x8 image area inside 'pixels' at position (x,y) placing the result members of the structure (<see cref="Y"/>, <see cref="Cb"/>, <see cref="Cr"/>)
-        /// </summary>
-        public void Convert(ImageFrame<TPixel> frame, int x, int y, ref RowOctet<TPixel> currentRows)
-        {
-            this.pixelBlock.LoadAndStretchEdges(frame.PixelBuffer, x, y, ref currentRows);
+                Unsafe.CopyBlock(ref d, ref s, byteWidth);
+
+                ref TPixel last = ref Unsafe.Add(ref Unsafe.As<byte, TPixel>(ref d), width - 1);
 
-            Span<Rgb24> rgbSpan = this.rgbBlock.AsSpanUnsafe();
-            PixelOperations<TPixel>.Instance.ToRgb24(frame.GetConfiguration(), this.pixelBlock.AsSpanUnsafe(), rgbSpan);
+                for (int x = 1; x <= remainderXCount; x++)
+                {
+                    Unsafe.Add(ref last, x) = last; // repeat the last copied pixel to fill the rest of the row
+                }
+            }
 
-            ref Block8x8F yBlock = ref this.Y;
-            ref Block8x8F cbBlock = ref this.Cb;
-            ref Block8x8F crBlock = ref this.Cr;
+            int remainderYCount = sampleSize.Height - height; // rows to pad at the bottom of the sample
 
-            if (RgbToYCbCrConverterVectorized.IsSupported)
+            if (remainderYCount == 0)
             {
-                RgbToYCbCrConverterVectorized.Convert(rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
+                return;
             }
-            else
+
+            ref byte lastRowStart = ref Unsafe.Add(ref blockStart, (height - 1) * rowSizeInBytes);
+
+            for (int y = 1; y <= remainderYCount; y++)
             {
-                this.colorTables.Convert(rgbSpan, ref yBlock, ref cbBlock, ref crBlock);
+                ref byte remStart = ref Unsafe.Add(ref lastRowStart, rowSizeInBytes * y);
+                Unsafe.CopyBlock(ref remStart, ref lastRowStart, (uint)rowSizeInBytes); // duplicate the last filled row
             }
         }
     }
diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
index a6d0622dd8..f31d07efca 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs
@@ -1,8 +1,13 @@
 // Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.
 
+using System.Diagnostics;
 using System.Numerics;
 using System.Runtime.CompilerServices;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
 
 // ReSharper disable InconsistentNaming
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components
@@ -10,7 +15,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
     /// <summary>
     /// Contains inaccurate, but fast forward and inverse DCT implementations.
     /// </summary>
-    internal static class FastFloatingPointDCT
+    internal static partial class FastFloatingPointDCT
     {
 #pragma warning disable SA1310 // FieldNamesMustNotContainUnderscore
         private const float C_1_175876 = 1.175875602f;
@@ -38,147 +43,31 @@ internal static class FastFloatingPointDCT
         private const float C_0_765367 = 0.765366865f;
 
         private const float C_0_125 = 0.1250f;
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+        private static readonly Vector256<float> C_V_0_5411 = Vector256.Create(0.541196f);
+        private static readonly Vector256<float> C_V_1_3065 = Vector256.Create(1.306563f);
+        private static readonly Vector256<float> C_V_1_1758 = Vector256.Create(1.175876f);
+        private static readonly Vector256<float> C_V_0_7856 = Vector256.Create(0.785695f);
+        private static readonly Vector256<float> C_V_1_3870 = Vector256.Create(1.387040f);
+        private static readonly Vector256<float> C_V_0_2758 = Vector256.Create(0.275899f);
+
+        private static readonly Vector256<float> C_V_n1_9615 = Vector256.Create(-1.961570560f);
+        private static readonly Vector256<float> C_V_n0_3901 = Vector256.Create(-0.390180644f);
+        private static readonly Vector256<float> C_V_n0_8999 = Vector256.Create(-0.899976223f);
+        private static readonly Vector256<float> C_V_n2_5629 = Vector256.Create(-2.562915447f);
+        private static readonly Vector256<float> C_V_0_2986 = Vector256.Create(0.298631336f);
+        private static readonly Vector256<float> C_V_2_0531 = Vector256.Create(2.053119869f);
+        private static readonly Vector256<float> C_V_3_0727 = Vector256.Create(3.072711026f);
+        private static readonly Vector256<float> C_V_1_5013 = Vector256.Create(1.501321110f);
+        private static readonly Vector256<float> C_V_n1_8477 = Vector256.Create(-1.847759065f);
+        private static readonly Vector256<float> C_V_0_7653 = Vector256.Create(0.765366865f);
+
+        private static readonly Vector256<float> C_V_InvSqrt2 = Vector256.Create(0.707107f); // 1 / sqrt(2) in every element
+#endif
 #pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore
         private static readonly Vector4 InvSqrt2 = new Vector4(0.707107f);
 
-        /// <summary>
-        /// Apply floating point IDCT transformation into dest, using a temporary block 'temp' provided by the caller (optimization).
-        /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239
-        /// </summary>
-        /// <param name="src">Source</param>
-        /// <param name="dest">Destination</param>
-        /// <param name="temp">Temporary block provided by the caller</param>
-        public static void TransformIDCT(ref Block8x8F src, ref Block8x8F dest, ref Block8x8F temp)
-        {
-            src.TransposeInto(ref temp);
-
-            IDCT8x4_LeftPart(ref temp, ref dest);
-            IDCT8x4_RightPart(ref temp, ref dest);
-
-            dest.TransposeInto(ref temp);
-
-            IDCT8x4_LeftPart(ref temp, ref dest);
-            IDCT8x4_RightPart(ref temp, ref dest);
-
-            // TODO: What if we leave the blocks in a scaled-by-x8 state until final color packing?
-            dest.MultiplyInPlace(C_0_125);
-        }
-
-        /// <summary>
-        /// Do IDCT internal operations on the left part of the block. Original src:
-        /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
-        /// </summary>
-        /// <param name="s">The source block</param>
-        /// <param name="d">Destination block</param>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d)
-        {
-            Vector4 my1 = s.V1L;
-            Vector4 my7 = s.V7L;
-            Vector4 mz0 = my1 + my7;
-
-            Vector4 my3 = s.V3L;
-            Vector4 mz2 = my3 + my7;
-            Vector4 my5 = s.V5L;
-            Vector4 mz1 = my3 + my5;
-            Vector4 mz3 = my1 + my5;
-
-            Vector4 mz4 = (mz0 + mz1) * C_1_175876;
-
-            mz2 = (mz2 * C_1_961571) + mz4;
-            mz3 = (mz3 * C_0_390181) + mz4;
-            mz0 = mz0 * C_0_899976;
-            mz1 = mz1 * C_2_562915;
-
-            Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
-            Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
-            Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
-            Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
-
-            Vector4 my2 = s.V2L;
-            Vector4 my6 = s.V6L;
-            mz4 = (my2 + my6) * C_0_541196;
-            Vector4 my0 = s.V0L;
-            Vector4 my4 = s.V4L;
-            mz0 = my0 + my4;
-            mz1 = my0 - my4;
-
-            mz2 = mz4 + (my6 * C_1_847759);
-            mz3 = mz4 + (my2 * C_0_765367);
-
-            my0 = mz0 + mz3;
-            my3 = mz0 - mz3;
-            my1 = mz1 + mz2;
-            my2 = mz1 - mz2;
-
-            d.V0L = my0 + mb0;
-            d.V7L = my0 - mb0;
-            d.V1L = my1 + mb1;
-            d.V6L = my1 - mb1;
-            d.V2L = my2 + mb2;
-            d.V5L = my2 - mb2;
-            d.V3L = my3 + mb3;
-            d.V4L = my3 - mb3;
-        }
-
-        /// <summary>
-        /// Do IDCT internal operations on the right part of the block.
-        /// Original src:
-        /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
-        /// </summary>
-        /// <param name="s">The source block</param>
-        /// <param name="d">The destination block</param>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d)
-        {
-            Vector4 my1 = s.V1R;
-            Vector4 my7 = s.V7R;
-            Vector4 mz0 = my1 + my7;
-
-            Vector4 my3 = s.V3R;
-            Vector4 mz2 = my3 + my7;
-            Vector4 my5 = s.V5R;
-            Vector4 mz1 = my3 + my5;
-            Vector4 mz3 = my1 + my5;
-
-            Vector4 mz4 = (mz0 + mz1) * C_1_175876;
-
-            mz2 = (mz2 * C_1_961571) + mz4;
-            mz3 = (mz3 * C_0_390181) + mz4;
-            mz0 = mz0 * C_0_899976;
-            mz1 = mz1 * C_2_562915;
-
-            Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
-            Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
-            Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
-            Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
-
-            Vector4 my2 = s.V2R;
-            Vector4 my6 = s.V6R;
-            mz4 = (my2 + my6) * C_0_541196;
-            Vector4 my0 = s.V0R;
-            Vector4 my4 = s.V4R;
-            mz0 = my0 + my4;
-            mz1 = my0 - my4;
-
-            mz2 = mz4 + (my6 * C_1_847759);
-            mz3 = mz4 + (my2 * C_0_765367);
-
-            my0 = mz0 + mz3;
-            my3 = mz0 - mz3;
-            my1 = mz1 + mz2;
-            my2 = mz1 - mz2;
-
-            d.V0R = my0 + mb0;
-            d.V7R = my0 - mb0;
-            d.V1R = my1 + mb1;
-            d.V6R = my1 - mb1;
-            d.V2R = my2 + mb2;
-            d.V5R = my2 - mb2;
-            d.V3R = my3 + mb3;
-            d.V4R = my3 - mb3;
-        }
-
         /// <summary>
         /// Original:
         /// <see>
@@ -309,11 +198,84 @@ public static void FDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d)
         }
 
         /// <summary>
-        /// Apply floating point IDCT transformation into dest, using a temporary block 'temp' provided by the caller (optimization)
+        /// Combined operation of <see cref="FDCT8x4_LeftPart(ref Block8x8F, ref Block8x8F)"/> and <see cref="FDCT8x4_RightPart(ref Block8x8F, ref Block8x8F)"/>
+        /// using AVX commands.
+        /// </summary>
+        /// <param name="s">Source</param>
+        /// <param name="d">Destination</param>
+        public static void FDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d)
+        {
+#if SUPPORTS_RUNTIME_INTRINSICS
+            Debug.Assert(Avx.IsSupported, "AVX is required to execute this method");
+
+            Vector256<float> t0 = Avx.Add(s.V0, s.V7); // t0..t7: sums and differences of the mirrored inputs (V0/V7, V1/V6, V2/V5, V3/V4)
+            Vector256<float> t7 = Avx.Subtract(s.V0, s.V7);
+            Vector256<float> t1 = Avx.Add(s.V1, s.V6);
+            Vector256<float> t6 = Avx.Subtract(s.V1, s.V6);
+            Vector256<float> t2 = Avx.Add(s.V2, s.V5);
+            Vector256<float> t5 = Avx.Subtract(s.V2, s.V5);
+            Vector256<float> t3 = Avx.Add(s.V3, s.V4);
+            Vector256<float> t4 = Avx.Subtract(s.V3, s.V4);
+
+            Vector256<float> c0 = Avx.Add(t0, t3);
+            Vector256<float> c1 = Avx.Add(t1, t2);
+
+            // Outputs V0 and V4 (from the sums t0..t3)
+            d.V0 = Avx.Add(c0, c1);
+            d.V4 = Avx.Subtract(c0, c1);
+
+            Vector256<float> c3 = Avx.Subtract(t0, t3);
+            Vector256<float> c2 = Avx.Subtract(t1, t2);
+
+            // Outputs V2 and V6 (from the sums t0..t3)
+            d.V2 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(c2, C_V_0_5411), c3, C_V_1_3065);
+            d.V6 = SimdUtils.HwIntrinsics.MultiplySubstract(Avx.Multiply(c2, C_V_1_3065), c3, C_V_0_5411);
+
+            c3 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(t4, C_V_1_1758), t7, C_V_0_7856); // c0..c3 are reused from here on for the difference terms t4..t7
+            c0 = SimdUtils.HwIntrinsics.MultiplySubstract(Avx.Multiply(t4, C_V_0_7856), t7, C_V_1_1758);
+
+            c2 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(t5, C_V_1_3870), C_V_0_2758, t6);
+            c1 = SimdUtils.HwIntrinsics.MultiplySubstract(Avx.Multiply(C_V_0_2758, t5), t6, C_V_1_3870);
+
+            // Outputs V3 and V5 (from the differences t4..t7)
+            d.V3 = Avx.Subtract(c0, c2);
+            d.V5 = Avx.Subtract(c3, c1);
+
+            c0 = Avx.Multiply(Avx.Add(c0, c2), C_V_InvSqrt2);
+            c3 = Avx.Multiply(Avx.Add(c3, c1), C_V_InvSqrt2);
+
+            // Outputs V1 and V7 (from the differences t4..t7)
+            d.V1 = Avx.Add(c0, c3);
+            d.V7 = Avx.Subtract(c0, c3);
+#endif
+        }
+
+        /// <summary>
+        /// Performs 8x8 matrix Forward Discrete Cosine Transform
+        /// </summary>
+        /// <param name="s">Source</param>
+        /// <param name="d">Destination</param>
+        public static void FDCT8x8(ref Block8x8F s, ref Block8x8F d)
+        {
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Avx.IsSupported)
+            {
+                FDCT8x8_Avx(ref s, ref d);
+                return;
+            }
+#endif
+
+            // Fallback: process the block as two 8x4 halves.
+            FDCT8x4_LeftPart(ref s, ref d);
+            FDCT8x4_RightPart(ref s, ref d);
+        }
+
+        /// <summary>
+        /// Apply floating point FDCT from src into dest
         /// </summary>
         /// <param name="src">Source</param>
         /// <param name="dest">Destination</param>
-        /// <param name="temp">Temporary block provided by the caller</param>
+        /// <param name="temp">Temporary block provided by the caller for optimization</param>
         /// <param name="offsetSourceByNeg128">If true, a constant -128.0 offset is applied for all values before FDCT </param>
         public static void TransformFDCT(
             ref Block8x8F src,
@@ -327,14 +289,225 @@ public static void TransformFDCT(
                 temp.AddInPlace(-128F);
             }
 
-            FDCT8x4_LeftPart(ref temp, ref dest);
-            FDCT8x4_RightPart(ref temp, ref dest);
+            FDCT8x8(ref temp, ref dest);
 
             dest.TransposeInto(ref temp);
 
-            FDCT8x4_LeftPart(ref temp, ref dest);
-            FDCT8x4_RightPart(ref temp, ref dest);
+            FDCT8x8(ref temp, ref dest);
+
+            dest.MultiplyInPlace(C_0_125);
+        }
+
+        /// <summary>
+        /// Performs 8x8 matrix Inverse Discrete Cosine Transform
+        /// </summary>
+        /// <param name="s">Source</param>
+        /// <param name="d">Destination</param>
+        public static void IDCT8x8(ref Block8x8F s, ref Block8x8F d)
+        {
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Avx.IsSupported)
+            {
+                IDCT8x8_Avx(ref s, ref d);
+                return;
+            }
+#endif
+
+            // Fallback: process the block as two 8x4 halves.
+            IDCT8x4_LeftPart(ref s, ref d);
+            IDCT8x4_RightPart(ref s, ref d);
+        }
+
+        /// <summary>
+        /// Do IDCT internal operations on the left part of the block. Original src:
+        /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
+        /// </summary>
+        /// <param name="s">The source block</param>
+        /// <param name="d">Destination block</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d)
+        {
+            Vector4 my1 = s.V1L; // the odd-indexed inputs (V1L, V3L, V5L, V7L) are combined first, into mb0..mb3
+            Vector4 my7 = s.V7L;
+            Vector4 mz0 = my1 + my7;
+
+            Vector4 my3 = s.V3L;
+            Vector4 mz2 = my3 + my7;
+            Vector4 my5 = s.V5L;
+            Vector4 mz1 = my3 + my5;
+            Vector4 mz3 = my1 + my5;
 
+            Vector4 mz4 = (mz0 + mz1) * C_1_175876;
+
+            mz2 = (mz2 * C_1_961571) + mz4;
+            mz3 = (mz3 * C_0_390181) + mz4;
+            mz0 = mz0 * C_0_899976;
+            mz1 = mz1 * C_2_562915;
+
+            Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
+            Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
+            Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
+            Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
+
+            Vector4 my2 = s.V2L; // the even-indexed inputs (V0L, V2L, V4L, V6L) follow; my0..my3 and mz0..mz4 are reused for them
+            Vector4 my6 = s.V6L;
+            mz4 = (my2 + my6) * C_0_541196;
+            Vector4 my0 = s.V0L;
+            Vector4 my4 = s.V4L;
+            mz0 = my0 + my4;
+            mz1 = my0 - my4;
+
+            mz2 = mz4 + (my6 * C_1_847759);
+            mz3 = mz4 + (my2 * C_0_765367);
+
+            my0 = mz0 + mz3;
+            my3 = mz0 - mz3;
+            my1 = mz1 + mz2;
+            my2 = mz1 - mz2;
+
+            d.V0L = my0 + mb0; // each output pair is the sum and difference of an even-input term and an odd-input term
+            d.V7L = my0 - mb0;
+            d.V1L = my1 + mb1;
+            d.V6L = my1 - mb1;
+            d.V2L = my2 + mb2;
+            d.V5L = my2 - mb2;
+            d.V3L = my3 + mb3;
+            d.V4L = my3 - mb3;
+        }
+
+        /// <summary>
+        /// Do IDCT internal operations on the right part of the block.
+        /// Original src:
+        /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261
+        /// </summary>
+        /// <param name="s">The source block</param>
+        /// <param name="d">The destination block</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d)
+        {
+            Vector4 my1 = s.V1R; // the odd-indexed inputs (V1R, V3R, V5R, V7R) are combined first, into mb0..mb3
+            Vector4 my7 = s.V7R;
+            Vector4 mz0 = my1 + my7;
+
+            Vector4 my3 = s.V3R;
+            Vector4 mz2 = my3 + my7;
+            Vector4 my5 = s.V5R;
+            Vector4 mz1 = my3 + my5;
+            Vector4 mz3 = my1 + my5;
+
+            Vector4 mz4 = (mz0 + mz1) * C_1_175876;
+
+            mz2 = (mz2 * C_1_961571) + mz4;
+            mz3 = (mz3 * C_0_390181) + mz4;
+            mz0 = mz0 * C_0_899976;
+            mz1 = mz1 * C_2_562915;
+
+            Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2;
+            Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3;
+            Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2;
+            Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3;
+
+            Vector4 my2 = s.V2R; // the even-indexed inputs (V0R, V2R, V4R, V6R) follow; my0..my3 and mz0..mz4 are reused for them
+            Vector4 my6 = s.V6R;
+            mz4 = (my2 + my6) * C_0_541196;
+            Vector4 my0 = s.V0R;
+            Vector4 my4 = s.V4R;
+            mz0 = my0 + my4;
+            mz1 = my0 - my4;
+
+            mz2 = mz4 + (my6 * C_1_847759);
+            mz3 = mz4 + (my2 * C_0_765367);
+
+            my0 = mz0 + mz3;
+            my3 = mz0 - mz3;
+            my1 = mz1 + mz2;
+            my2 = mz1 - mz2;
+
+            d.V0R = my0 + mb0; // each output pair is the sum and difference of an even-input term and an odd-input term
+            d.V7R = my0 - mb0;
+            d.V1R = my1 + mb1;
+            d.V6R = my1 - mb1;
+            d.V2R = my2 + mb2;
+            d.V5R = my2 - mb2;
+            d.V3R = my3 + mb3;
+            d.V4R = my3 - mb3;
+        }
+
+        /// <summary>
+        /// Combined operation of <see cref="IDCT8x4_LeftPart(ref Block8x8F, ref Block8x8F)"/> and <see cref="IDCT8x4_RightPart(ref Block8x8F, ref Block8x8F)"/>
+        /// using AVX commands.
+        /// </summary>
+        /// <param name="s">Source</param>
+        /// <param name="d">Destination</param>
+        public static void IDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d)
+        {
+#if SUPPORTS_RUNTIME_INTRINSICS
+            Debug.Assert(Avx.IsSupported, "AVX is required to execute this method");
+
+            Vector256<float> my1 = s.V1; // the odd-indexed inputs (V1, V3, V5, V7) are combined first, into mb0..mb3
+            Vector256<float> my7 = s.V7;
+            Vector256<float> mz0 = Avx.Add(my1, my7);
+
+            Vector256<float> my3 = s.V3;
+            Vector256<float> mz2 = Avx.Add(my3, my7);
+            Vector256<float> my5 = s.V5;
+            Vector256<float> mz1 = Avx.Add(my3, my5);
+            Vector256<float> mz3 = Avx.Add(my1, my5);
+
+            Vector256<float> mz4 = Avx.Multiply(Avx.Add(mz0, mz1), C_V_1_1758);
+
+            mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz2, C_V_n1_9615);
+            mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz3, C_V_n0_3901);
+            mz0 = Avx.Multiply(mz0, C_V_n0_8999);
+            mz1 = Avx.Multiply(mz1, C_V_n2_5629);
+
+            Vector256<float> mb3 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my7, C_V_0_2986), mz2);
+            Vector256<float> mb2 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my5, C_V_2_0531), mz3);
+            Vector256<float> mb1 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my3, C_V_3_0727), mz2);
+            Vector256<float> mb0 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my1, C_V_1_5013), mz3);
+
+            Vector256<float> my2 = s.V2; // the even-indexed inputs (V0, V2, V4, V6) follow; my0..my3 and mz0..mz4 are reused for them
+            Vector256<float> my6 = s.V6;
+            mz4 = Avx.Multiply(Avx.Add(my2, my6), C_V_0_5411);
+            Vector256<float> my0 = s.V0;
+            Vector256<float> my4 = s.V4;
+            mz0 = Avx.Add(my0, my4);
+            mz1 = Avx.Subtract(my0, my4);
+            mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my6, C_V_n1_8477);
+            mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my2, C_V_0_7653);
+
+            my0 = Avx.Add(mz0, mz3);
+            my3 = Avx.Subtract(mz0, mz3);
+            my1 = Avx.Add(mz1, mz2);
+            my2 = Avx.Subtract(mz1, mz2);
+
+            d.V0 = Avx.Add(my0, mb0); // each output pair is the sum and difference of an even-input term and an odd-input term
+            d.V7 = Avx.Subtract(my0, mb0);
+            d.V1 = Avx.Add(my1, mb1);
+            d.V6 = Avx.Subtract(my1, mb1);
+            d.V2 = Avx.Add(my2, mb2);
+            d.V5 = Avx.Subtract(my2, mb2);
+            d.V3 = Avx.Add(my3, mb3);
+            d.V4 = Avx.Subtract(my3, mb3);
+#endif
+        }
+
+        /// <summary>
+        /// Apply floating point IDCT transformation into dest, using a temporary block 'temp' provided by the caller (optimization).
+        /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239
+        /// </summary>
+        /// <param name="src">Source</param>
+        /// <param name="dest">Destination</param>
+        /// <param name="temp">Temporary block provided by the caller</param>
+        public static void TransformIDCT(ref Block8x8F src, ref Block8x8F dest, ref Block8x8F temp)
+        {
+            src.TransposeInto(ref temp); // work on the transposed source; 'src' itself is only read
+
+            IDCT8x8(ref temp, ref dest); // first pass
+            dest.TransposeInto(ref temp); // transpose the intermediate result back into 'temp'
+            IDCT8x8(ref temp, ref dest); // second pass, now along the other axis of the block
+
+            // TODO: What if we leave the blocks in a scaled-by-x8 state until final color packing?
             dest.MultiplyInPlace(C_0_125);
         }
     }
diff --git a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
index f5dc1c79fe..6020e6196c 100644
--- a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
+++ b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs
@@ -5,14 +5,11 @@
 using System.Buffers.Binary;
 using System.IO;
 using System.Linq;
-using System.Runtime.CompilerServices;
-using System.Runtime.InteropServices;
 using System.Threading;
 using SixLabors.ImageSharp.Common.Helpers;
 using SixLabors.ImageSharp.Formats.Jpeg.Components;
 using SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder;
 using SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder;
-using SixLabors.ImageSharp.Memory;
 using SixLabors.ImageSharp.Metadata;
 using SixLabors.ImageSharp.Metadata.Profiles.Exif;
 using SixLabors.ImageSharp.Metadata.Profiles.Icc;
@@ -32,20 +29,47 @@ internal sealed unsafe class JpegEncoderCore : IImageEncoderInternals
         private const int QuantizationTableCount = 2;
 
         /// <summary>
-        /// A scratch buffer to reduce allocations.
+        /// Gets the unscaled luminance quantization table in zig-zag order. Each
+        /// encoder copies and scales the tables according to its quality parameter.
+        /// The values are derived from section K.1 after converting from natural to
+        /// zig-zag order.
         /// </summary>
-        private readonly byte[] buffer = new byte[20];
+        // The C# compiler emits this as a compile-time constant embedded in the PE file.
+        // This is effectively compiled down to: return new ReadOnlySpan<byte>(&data, length)
+        // More details can be found: https://github.com/dotnet/roslyn/pull/24621
+        private static ReadOnlySpan<byte> UnscaledQuant_Luminance => new byte[]
+            {
+                // Luminance.
+                16, 11, 12, 14, 12, 10, 16, 14, 13, 14, 18, 17, 16, 19, 24,
+                40, 26, 24, 22, 22, 24, 49, 35, 37, 29, 40, 58, 51, 61, 60,
+                57, 51, 56, 55, 64, 72, 92, 78, 64, 68, 87, 69, 55, 56, 80,
+                109, 81, 87, 95, 98, 103, 104, 103, 62, 77, 113, 121, 112,
+                100, 120, 92, 101, 103, 99,
+            };
 
         /// <summary>
-        /// A buffer for reducing the number of stream writes when emitting Huffman tables. 64 seems to be enough.
+        /// Gets the unscaled chrominance quantization table in zig-zag order. Each
+        /// encoder copies and scales the tables according to its quality parameter.
+        /// The values are derived from section K.1 after converting from natural to
+        /// zig-zag order.
         /// </summary>
-        private readonly byte[] emitBuffer = new byte[64];
+        // The C# compiler emits this as a compile-time constant embedded in the PE file.
+        // This is effectively compiled down to: return new ReadOnlySpan<byte>(&data, length)
+        // More details can be found: https://github.com/dotnet/roslyn/pull/24621
+        private static ReadOnlySpan<byte> UnscaledQuant_Chrominance => new byte[]
+            {
+                // Chrominance.
+                17, 18, 18, 24, 21, 24, 47, 26, 26, 47, 99, 66, 56, 66,
+                99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
+                99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
+                99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
+                99, 99, 99, 99, 99, 99, 99, 99,
+            };
 
         /// <summary>
-        /// A buffer for reducing the number of stream writes when emitting Huffman tables. Max combined table lengths +
-        /// identifier.
+        /// A scratch buffer to reduce allocations.
         /// </summary>
-        private readonly byte[] huffmanBuffer = new byte[179];
+        private readonly byte[] buffer = new byte[20];
 
         /// <summary>
         /// Gets or sets the subsampling method to use.
@@ -62,26 +86,6 @@ internal sealed unsafe class JpegEncoderCore : IImageEncoderInternals
         /// </summary>
         private readonly JpegColorType? colorType;
 
-        /// <summary>
-        /// The accumulated bits to write to the stream.
-        /// </summary>
-        private uint accumulatedBits;
-
-        /// <summary>
-        /// The accumulated bit count.
-        /// </summary>
-        private uint bitCount;
-
-        /// <summary>
-        /// The scaled chrominance table, in zig-zag order.
-        /// </summary>
-        private Block8x8F chrominanceQuantTable;
-
-        /// <summary>
-        /// The scaled luminance table, in zig-zag order.
-        /// </summary>
-        private Block8x8F luminanceQuantTable;
-
         /// <summary>
         /// The output stream. All attempted writes after the first error become no-ops.
         /// </summary>
@@ -98,67 +102,6 @@ public JpegEncoderCore(IJpegEncoderOptions options)
             this.colorType = options.ColorType;
         }
 
-        /// <summary>
-        /// Gets the counts the number of bits needed to hold an integer.
-        /// </summary>
-        // The C# compiler emits this as a compile-time constant embedded in the PE file.
-        // This is effectively compiled down to: return new ReadOnlySpan<byte>(&data, length)
-        // More details can be found: https://github.com/dotnet/roslyn/pull/24621
-        private static ReadOnlySpan<byte> BitCountLut => new byte[]
-            {
-                0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5,
-                5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-                6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-                7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-                7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-                7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-                7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-                8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-                8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-                8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-                8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-                8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-                8, 8, 8,
-            };
-
-        /// <summary>
-        /// Gets the unscaled quantization tables in zig-zag order. Each
-        /// encoder copies and scales the tables according to its quality parameter.
-        /// The values are derived from section K.1 after converting from natural to
-        /// zig-zag order.
-        /// </summary>
-        // The C# compiler emits this as a compile-time constant embedded in the PE file.
-        // This is effectively compiled down to: return new ReadOnlySpan<byte>(&data, length)
-        // More details can be found: https://github.com/dotnet/roslyn/pull/24621
-        private static ReadOnlySpan<byte> UnscaledQuant_Luminance => new byte[]
-            {
-                // Luminance.
-                16, 11, 12, 14, 12, 10, 16, 14, 13, 14, 18, 17, 16, 19, 24,
-                40, 26, 24, 22, 22, 24, 49, 35, 37, 29, 40, 58, 51, 61, 60,
-                57, 51, 56, 55, 64, 72, 92, 78, 64, 68, 87, 69, 55, 56, 80,
-                109, 81, 87, 95, 98, 103, 104, 103, 62, 77, 113, 121, 112,
-                100, 120, 92, 101, 103, 99,
-            };
-
-        /// <summary>
-        /// Gets the unscaled quantization tables in zig-zag order. Each
-        /// encoder copies and scales the tables according to its quality parameter.
-        /// The values are derived from section K.1 after converting from natural to
-        /// zig-zag order.
-        /// </summary>
-        // The C# compiler emits this as a compile-time constant embedded in the PE file.
-        // This is effectively compiled down to: return new ReadOnlySpan<byte>(&data, length)
-        // More details can be found: https://github.com/dotnet/roslyn/pull/24621
-        private static ReadOnlySpan<byte> UnscaledQuant_Chrominance => new byte[]
-            {
-                // Chrominance.
-                17, 18, 18, 24, 21, 24, 47, 26, 26, 47, 99, 66, 56, 66,
-                99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
-                99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
-                99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
-                99, 99, 99, 99, 99, 99, 99, 99,
-            };
-
         /// <summary>
         /// Encode writes the image to the jpeg baseline format with the given options.
         /// </summary>
@@ -171,14 +114,14 @@ public void Encode<TPixel>(Image<TPixel> image, Stream stream, CancellationToken
         {
             Guard.NotNull(image, nameof(image));
             Guard.NotNull(stream, nameof(stream));
-            cancellationToken.ThrowIfCancellationRequested();
 
-            const ushort max = JpegConstants.MaxLength;
-            if (image.Width >= max || image.Height >= max)
+            if (image.Width >= JpegConstants.MaxLength || image.Height >= JpegConstants.MaxLength)
             {
-                throw new ImageFormatException($"Image is too large to encode at {image.Width}x{image.Height}.");
+                JpegThrowHelper.ThrowDimensionsTooLarge(image.Width, image.Height);
             }
 
+            cancellationToken.ThrowIfCancellationRequested();
+
             this.outputStream = stream;
             ImageMetadata metadata = image.Metadata;
 
@@ -201,10 +144,14 @@ public void Encode<TPixel>(Image<TPixel> image, Stream stream, CancellationToken
             }
 
             // Initialize the quantization tables.
-            InitQuantizationTable(0, scale, ref this.luminanceQuantTable);
+            // TODO: This looks ugly, should we write chrominance table for luminance-only images?
+            // If not - this code can be simplified
+            Block8x8F luminanceQuantTable = default;
+            Block8x8F chrominanceQuantTable = default;
+            InitQuantizationTable(0, scale, ref luminanceQuantTable);
             if (componentCount > 1)
             {
-                InitQuantizationTable(1, scale, ref this.chrominanceQuantTable);
+                InitQuantizationTable(1, scale, ref chrominanceQuantTable);
             }
 
             // Write the Start Of Image marker.
@@ -214,7 +161,7 @@ public void Encode<TPixel>(Image<TPixel> image, Stream stream, CancellationToken
             this.WriteProfiles(metadata);
 
             // Write the quantization tables.
-            this.WriteDefineQuantizationTables();
+            this.WriteDefineQuantizationTables(ref luminanceQuantTable, ref chrominanceQuantTable);
 
             // Write the image dimensions.
             this.WriteStartOfFrame(image.Width, image.Height, componentCount);
@@ -222,13 +169,31 @@ public void Encode<TPixel>(Image<TPixel> image, Stream stream, CancellationToken
             // Write the Huffman tables.
             this.WriteDefineHuffmanTables(componentCount);
 
-            // Write the image data.
+            // Write the scan header.
             this.WriteStartOfScan(image, componentCount, cancellationToken);
 
+            // Write the scan compressed data.
+            var scanEncoder = new HuffmanScanEncoder(stream);
+            if (this.colorType == JpegColorType.Luminance)
+            {
+                scanEncoder.EncodeGrayscale(image, ref luminanceQuantTable, cancellationToken);
+            }
+            else
+            {
+                switch (subsample)
+                {
+                    case JpegSubsample.Ratio444:
+                        scanEncoder.Encode444(image, ref luminanceQuantTable, ref chrominanceQuantTable, cancellationToken);
+                        break;
+                    case JpegSubsample.Ratio420:
+                        scanEncoder.Encode420(image, ref luminanceQuantTable, ref chrominanceQuantTable, cancellationToken);
+                        break;
+                }
+            }
+
             // Write the End Of Image marker.
-            this.buffer[0] = JpegConstants.Markers.XFF;
-            this.buffer[1] = JpegConstants.Markers.EOI;
-            stream.Write(this.buffer, 0, 2);
+            this.WriteEndOfImageMarker();
+
             stream.Flush();
         }
 
@@ -248,248 +213,6 @@ private static void WriteDataToDqt(byte[] dqt, ref int offset, QuantIndex i, ref
             }
         }
 
-        /// <summary>
-        /// Initializes quantization table.
-        /// </summary>
-        /// <param name="i">The quantization index.</param>
-        /// <param name="scale">The scaling factor.</param>
-        /// <param name="quant">The quantization table.</param>
-        private static void InitQuantizationTable(int i, int scale, ref Block8x8F quant)
-        {
-            DebugGuard.MustBeBetweenOrEqualTo(i, 0, 1, nameof(i));
-            ReadOnlySpan<byte> unscaledQuant = (i == 0) ? UnscaledQuant_Luminance : UnscaledQuant_Chrominance;
-
-            for (int j = 0; j < Block8x8F.Size; j++)
-            {
-                int x = unscaledQuant[j];
-                x = ((x * scale) + 50) / 100;
-                if (x < 1)
-                {
-                    x = 1;
-                }
-
-                if (x > 255)
-                {
-                    x = 255;
-                }
-
-                quant[j] = x;
-            }
-        }
-
-        /// <summary>
-        /// Emits the least significant count of bits of bits to the bit-stream.
-        /// The precondition is bits
-        /// <example>
-        /// &lt; 1&lt;&lt;nBits &amp;&amp; nBits &lt;= 16
-        /// </example>
-        /// .
-        /// </summary>
-        /// <param name="bits">The packed bits.</param>
-        /// <param name="count">The number of bits</param>
-        /// <param name="emitBufferBase">The reference to the emitBuffer.</param>
-        [MethodImpl(InliningOptions.ShortMethod)]
-        private void Emit(uint bits, uint count, ref byte emitBufferBase)
-        {
-            count += this.bitCount;
-            bits <<= (int)(32 - count);
-            bits |= this.accumulatedBits;
-
-            // Only write if more than 8 bits.
-            if (count >= 8)
-            {
-                // Track length
-                int len = 0;
-                while (count >= 8)
-                {
-                    byte b = (byte)(bits >> 24);
-                    Unsafe.Add(ref emitBufferBase, len++) = b;
-                    if (b == byte.MaxValue)
-                    {
-                        Unsafe.Add(ref emitBufferBase, len++) = byte.MinValue;
-                    }
-
-                    bits <<= 8;
-                    count -= 8;
-                }
-
-                if (len > 0)
-                {
-                    this.outputStream.Write(this.emitBuffer, 0, len);
-                }
-            }
-
-            this.accumulatedBits = bits;
-            this.bitCount = count;
-        }
-
-        /// <summary>
-        /// Emits the given value with the given Huffman encoder.
-        /// </summary>
-        /// <param name="index">The index of the Huffman encoder</param>
-        /// <param name="value">The value to encode.</param>
-        /// <param name="emitBufferBase">The reference to the emit buffer.</param>
-        [MethodImpl(InliningOptions.ShortMethod)]
-        private void EmitHuff(HuffIndex index, int value, ref byte emitBufferBase)
-        {
-            uint x = HuffmanLut.TheHuffmanLut[(int)index].Values[value];
-            this.Emit(x & ((1 << 24) - 1), x >> 24, ref emitBufferBase);
-        }
-
-        /// <summary>
-        /// Emits a run of runLength copies of value encoded with the given Huffman encoder.
-        /// </summary>
-        /// <param name="index">The index of the Huffman encoder</param>
-        /// <param name="runLength">The number of copies to encode.</param>
-        /// <param name="value">The value to encode.</param>
-        /// <param name="emitBufferBase">The reference to the emit buffer.</param>
-        [MethodImpl(InliningOptions.ShortMethod)]
-        private void EmitHuffRLE(HuffIndex index, int runLength, int value, ref byte emitBufferBase)
-        {
-            int a = value;
-            int b = value;
-            if (a < 0)
-            {
-                a = -value;
-                b = value - 1;
-            }
-
-            uint bt;
-            if (a < 0x100)
-            {
-                bt = BitCountLut[a];
-            }
-            else
-            {
-                bt = 8 + (uint)BitCountLut[a >> 8];
-            }
-
-            this.EmitHuff(index, (int)((uint)(runLength << 4) | bt), ref emitBufferBase);
-            if (bt > 0)
-            {
-                this.Emit((uint)b & (uint)((1 << ((int)bt)) - 1), bt, ref emitBufferBase);
-            }
-        }
-
-        /// <summary>
-        /// Encodes the image with no subsampling.
-        /// </summary>
-        /// <typeparam name="TPixel">The pixel format.</typeparam>
-        /// <param name="pixels">The pixel accessor providing access to the image pixels.</param>
-        /// <param name="cancellationToken">The token to monitor for cancellation.</param>
-        /// <param name="emitBufferBase">The reference to the emit buffer.</param>
-        private void Encode444<TPixel>(Image<TPixel> pixels, CancellationToken cancellationToken, ref byte emitBufferBase)
-            where TPixel : unmanaged, IPixel<TPixel>
-        {
-            // TODO: Need a JpegScanEncoder<TPixel> class or struct that encapsulates the scan-encoding implementation. (Similar to JpegScanDecoder.)
-            // (Partially done with YCbCrForwardConverter<TPixel>)
-            Block8x8F temp1 = default;
-            Block8x8F temp2 = default;
-
-            Block8x8F onStackLuminanceQuantTable = this.luminanceQuantTable;
-            Block8x8F onStackChrominanceQuantTable = this.chrominanceQuantTable;
-
-            var unzig = ZigZag.CreateUnzigTable();
-
-            // ReSharper disable once InconsistentNaming
-            int prevDCY = 0, prevDCCb = 0, prevDCCr = 0;
-
-            var pixelConverter = YCbCrForwardConverter<TPixel>.Create();
-            ImageFrame<TPixel> frame = pixels.Frames.RootFrame;
-            Buffer2D<TPixel> pixelBuffer = frame.PixelBuffer;
-            RowOctet<TPixel> currentRows = default;
-
-            for (int y = 0; y < pixels.Height; y += 8)
-            {
-                cancellationToken.ThrowIfCancellationRequested();
-                currentRows.Update(pixelBuffer, y);
-
-                for (int x = 0; x < pixels.Width; x += 8)
-                {
-                    pixelConverter.Convert(frame, x, y, ref currentRows);
-
-                    prevDCY = this.WriteBlock(
-                        QuantIndex.Luminance,
-                        prevDCY,
-                        ref pixelConverter.Y,
-                        ref temp1,
-                        ref temp2,
-                        ref onStackLuminanceQuantTable,
-                        ref unzig,
-                        ref emitBufferBase);
-
-                    prevDCCb = this.WriteBlock(
-                        QuantIndex.Chrominance,
-                        prevDCCb,
-                        ref pixelConverter.Cb,
-                        ref temp1,
-                        ref temp2,
-                        ref onStackChrominanceQuantTable,
-                        ref unzig,
-                        ref emitBufferBase);
-
-                    prevDCCr = this.WriteBlock(
-                        QuantIndex.Chrominance,
-                        prevDCCr,
-                        ref pixelConverter.Cr,
-                        ref temp1,
-                        ref temp2,
-                        ref onStackChrominanceQuantTable,
-                        ref unzig,
-                        ref emitBufferBase);
-                }
-            }
-        }
-
-        /// <summary>
-        /// Encodes the image with no chroma, just luminance.
-        /// </summary>
-        /// <typeparam name="TPixel">The pixel format.</typeparam>
-        /// <param name="pixels">The pixel accessor providing access to the image pixels.</param>
-        /// <param name="cancellationToken">The token to monitor for cancellation.</param>
-        /// <param name="emitBufferBase">The reference to the emit buffer.</param>
-        private void EncodeGrayscale<TPixel>(Image<TPixel> pixels, CancellationToken cancellationToken, ref byte emitBufferBase)
-            where TPixel : unmanaged, IPixel<TPixel>
-        {
-            // TODO: Need a JpegScanEncoder<TPixel> class or struct that encapsulates the scan-encoding implementation. (Similar to JpegScanDecoder.)
-            // (Partially done with YCbCrForwardConverter<TPixel>)
-            Block8x8F temp1 = default;
-            Block8x8F temp2 = default;
-
-            Block8x8F onStackLuminanceQuantTable = this.luminanceQuantTable;
-
-            var unzig = ZigZag.CreateUnzigTable();
-
-            // ReSharper disable once InconsistentNaming
-            int prevDCY = 0;
-
-            var pixelConverter = LuminanceForwardConverter<TPixel>.Create();
-            ImageFrame<TPixel> frame = pixels.Frames.RootFrame;
-            Buffer2D<TPixel> pixelBuffer = frame.PixelBuffer;
-            RowOctet<TPixel> currentRows = default;
-
-            for (int y = 0; y < pixels.Height; y += 8)
-            {
-                cancellationToken.ThrowIfCancellationRequested();
-                currentRows.Update(pixelBuffer, y);
-
-                for (int x = 0; x < pixels.Width; x += 8)
-                {
-                    pixelConverter.Convert(frame, x, y, ref currentRows);
-
-                    prevDCY = this.WriteBlock(
-                        QuantIndex.Luminance,
-                        prevDCY,
-                        ref pixelConverter.Y,
-                        ref temp1,
-                        ref temp2,
-                        ref onStackLuminanceQuantTable,
-                        ref unzig,
-                        ref emitBufferBase);
-                }
-            }
-        }
-
         /// <summary>
         /// Writes the application header containing the JFIF identifier plus extra data.
         /// </summary>
@@ -539,72 +262,6 @@ private void WriteApplicationHeader(ImageMetadata meta)
             this.outputStream.Write(this.buffer, 0, 20);
         }
 
-        /// <summary>
-        /// Writes a block of pixel data using the given quantization table,
-        /// returning the post-quantized DC value of the DCT-transformed block.
-        /// The block is in natural (not zig-zag) order.
-        /// </summary>
-        /// <param name="index">The quantization table index.</param>
-        /// <param name="prevDC">The previous DC value.</param>
-        /// <param name="src">Source block</param>
-        /// <param name="tempDest1">Temporal block to be used as FDCT Destination</param>
-        /// <param name="tempDest2">Temporal block 2</param>
-        /// <param name="quant">Quantization table</param>
-        /// <param name="unZig">The 8x8 Unzig block.</param>
-        /// <param name="emitBufferBase">The reference to the emit buffer.</param>
-        /// <returns>The <see cref="int"/>.</returns>
-        private int WriteBlock(
-            QuantIndex index,
-            int prevDC,
-            ref Block8x8F src,
-            ref Block8x8F tempDest1,
-            ref Block8x8F tempDest2,
-            ref Block8x8F quant,
-            ref ZigZag unZig,
-            ref byte emitBufferBase)
-        {
-            FastFloatingPointDCT.TransformFDCT(ref src, ref tempDest1, ref tempDest2);
-
-            Block8x8F.Quantize(ref tempDest1, ref tempDest2, ref quant, ref unZig);
-
-            int dc = (int)tempDest2[0];
-
-            // Emit the DC delta.
-            this.EmitHuffRLE((HuffIndex)((2 * (int)index) + 0), 0, dc - prevDC, ref emitBufferBase);
-
-            // Emit the AC components.
-            var h = (HuffIndex)((2 * (int)index) + 1);
-            int runLength = 0;
-
-            for (int zig = 1; zig < Block8x8F.Size; zig++)
-            {
-                int ac = (int)tempDest2[zig];
-
-                if (ac == 0)
-                {
-                    runLength++;
-                }
-                else
-                {
-                    while (runLength > 15)
-                    {
-                        this.EmitHuff(h, 0xf0, ref emitBufferBase);
-                        runLength -= 16;
-                    }
-
-                    this.EmitHuffRLE(h, runLength, ac, ref emitBufferBase);
-                    runLength = 0;
-                }
-            }
-
-            if (runLength > 0)
-            {
-                this.EmitHuff(h, 0x00, ref emitBufferBase);
-            }
-
-            return dc;
-        }
-
         /// <summary>
         /// Writes the Define Huffman Table marker and tables.
         /// </summary>
@@ -638,34 +295,16 @@ private void WriteDefineHuffmanTables(int componentCount)
             this.WriteMarkerHeader(JpegConstants.Markers.DHT, markerlen);
             for (int i = 0; i < specs.Length; i++)
             {
-                ref HuffmanSpec spec = ref specs[i];
-                int len = 0;
-
-                fixed (byte* huffman = this.huffmanBuffer)
-                fixed (byte* count = spec.Count)
-                fixed (byte* values = spec.Values)
-                {
-                    huffman[len++] = headers[i];
-
-                    for (int c = 0; c < spec.Count.Length; c++)
-                    {
-                        huffman[len++] = count[c];
-                    }
-
-                    for (int v = 0; v < spec.Values.Length; v++)
-                    {
-                        huffman[len++] = values[v];
-                    }
-                }
-
-                this.outputStream.Write(this.huffmanBuffer, 0, len);
+                this.outputStream.WriteByte(headers[i]);
+                this.outputStream.Write(specs[i].Count);
+                this.outputStream.Write(specs[i].Values);
             }
         }
 
         /// <summary>
         /// Writes the Define Quantization Marker and tables.
         /// </summary>
-        private void WriteDefineQuantizationTables()
+        private void WriteDefineQuantizationTables(ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable)
         {
             // Marker + quantization table lengths
             int markerlen = 2 + (QuantizationTableCount * (1 + Block8x8F.Size));
@@ -677,8 +316,8 @@ private void WriteDefineQuantizationTables()
             byte[] dqt = new byte[dqtCount];
             int offset = 0;
 
-            WriteDataToDqt(dqt, ref offset, QuantIndex.Luminance, ref this.luminanceQuantTable);
-            WriteDataToDqt(dqt, ref offset, QuantIndex.Chrominance, ref this.chrominanceQuantTable);
+            WriteDataToDqt(dqt, ref offset, QuantIndex.Luminance, ref luminanceQuantTable);
+            WriteDataToDqt(dqt, ref offset, QuantIndex.Chrominance, ref chrominanceQuantTable);
 
             this.outputStream.Write(dqt, 0, dqtCount);
         }
@@ -982,7 +621,6 @@ private void WriteStartOfFrame(int width, int height, int componentCount)
         private void WriteStartOfScan<TPixel>(Image<TPixel> image, int componentCount, CancellationToken cancellationToken)
             where TPixel : unmanaged, IPixel<TPixel>
         {
-            // TODO: Need a JpegScanEncoder<TPixel> class or struct that encapsulates the scan-encoding implementation. (Similar to JpegScanDecoder.)
             Span<byte> componentId = stackalloc byte[]
             {
                 0x01,
@@ -1024,111 +662,16 @@ private void WriteStartOfScan<TPixel>(Image<TPixel> image, int componentCount, C
             this.buffer[sosSize] = 0x3f; // Se - End of spectral selection.
             this.buffer[sosSize + 1] = 0x00; // Ah + Ah (Successive approximation bit position high + low)
             this.outputStream.Write(this.buffer, 0, sosSize + 2);
-
-            ref byte emitBufferBase = ref MemoryMarshal.GetReference<byte>(this.emitBuffer);
-            if (this.colorType == JpegColorType.Luminance)
-            {
-                this.EncodeGrayscale(image, cancellationToken, ref emitBufferBase);
-            }
-            else
-            {
-                switch (this.subsample)
-                {
-                    case JpegSubsample.Ratio444:
-                        this.Encode444(image, cancellationToken, ref emitBufferBase);
-                        break;
-                    case JpegSubsample.Ratio420:
-                        this.Encode420(image, cancellationToken, ref emitBufferBase);
-                        break;
-                }
-            }
-
-            // Pad the last byte with 1's.
-            this.Emit(0x7f, 7, ref emitBufferBase);
         }
 
         /// <summary>
-        /// Encodes the image with subsampling. The Cb and Cr components are each subsampled
-        /// at a factor of 2 both horizontally and vertically.
+        /// Writes the EndOfImage marker.
         /// </summary>
-        /// <typeparam name="TPixel">The pixel format.</typeparam>
-        /// <param name="pixels">The pixel accessor providing access to the image pixels.</param>
-        /// <param name="cancellationToken">The token to monitor for cancellation.</param>
-        /// <param name="emitBufferBase">The reference to the emit buffer.</param>
-        private void Encode420<TPixel>(Image<TPixel> pixels, CancellationToken cancellationToken, ref byte emitBufferBase)
-            where TPixel : unmanaged, IPixel<TPixel>
+        private void WriteEndOfImageMarker()
         {
-            // TODO: Need a JpegScanEncoder<TPixel> class or struct that encapsulates the scan-encoding implementation. (Similar to JpegScanDecoder.)
-            Block8x8F b = default;
-            Span<Block8x8F> cb = stackalloc Block8x8F[4];
-            Span<Block8x8F> cr = stackalloc Block8x8F[4];
-
-            Block8x8F temp1 = default;
-            Block8x8F temp2 = default;
-
-            Block8x8F onStackLuminanceQuantTable = this.luminanceQuantTable;
-            Block8x8F onStackChrominanceQuantTable = this.chrominanceQuantTable;
-
-            var unzig = ZigZag.CreateUnzigTable();
-
-            var pixelConverter = YCbCrForwardConverter<TPixel>.Create();
-
-            // ReSharper disable once InconsistentNaming
-            int prevDCY = 0, prevDCCb = 0, prevDCCr = 0;
-            ImageFrame<TPixel> frame = pixels.Frames.RootFrame;
-            Buffer2D<TPixel> pixelBuffer = frame.PixelBuffer;
-            RowOctet<TPixel> currentRows = default;
-
-            for (int y = 0; y < pixels.Height; y += 16)
-            {
-                cancellationToken.ThrowIfCancellationRequested();
-                for (int x = 0; x < pixels.Width; x += 16)
-                {
-                    for (int i = 0; i < 4; i++)
-                    {
-                        int xOff = (i & 1) * 8;
-                        int yOff = (i & 2) * 4;
-
-                        currentRows.Update(pixelBuffer, y + yOff);
-                        pixelConverter.Convert(frame, x + xOff, y + yOff, ref currentRows);
-
-                        cb[i] = pixelConverter.Cb;
-                        cr[i] = pixelConverter.Cr;
-
-                        prevDCY = this.WriteBlock(
-                            QuantIndex.Luminance,
-                            prevDCY,
-                            ref pixelConverter.Y,
-                            ref temp1,
-                            ref temp2,
-                            ref onStackLuminanceQuantTable,
-                            ref unzig,
-                            ref emitBufferBase);
-                    }
-
-                    Block8x8F.Scale16X16To8X8(ref b, cb);
-                    prevDCCb = this.WriteBlock(
-                        QuantIndex.Chrominance,
-                        prevDCCb,
-                        ref b,
-                        ref temp1,
-                        ref temp2,
-                        ref onStackChrominanceQuantTable,
-                        ref unzig,
-                        ref emitBufferBase);
-
-                    Block8x8F.Scale16X16To8X8(ref b, cr);
-                    prevDCCr = this.WriteBlock(
-                        QuantIndex.Chrominance,
-                        prevDCCr,
-                        ref b,
-                        ref temp1,
-                        ref temp2,
-                        ref onStackChrominanceQuantTable,
-                        ref unzig,
-                        ref emitBufferBase);
-                }
-            }
+            this.buffer[0] = JpegConstants.Markers.XFF;
+            this.buffer[1] = JpegConstants.Markers.EOI;
+            this.outputStream.Write(this.buffer, 0, 2);
         }
 
         /// <summary>
@@ -1145,5 +688,34 @@ private void WriteMarkerHeader(byte marker, int length)
             this.buffer[3] = (byte)(length & 0xff);
             this.outputStream.Write(this.buffer, 0, 4);
         }
+
+        /// <summary>
+        /// Fills a quantization table from the unscaled base table selected by <paramref name="i"/>,
+        /// scaling each entry by <paramref name="scale"/> percent and limiting it to the range [1, 255].
+        /// </summary>
+        /// <param name="i">The quantization index: 0 selects luminance, 1 selects chrominance.</param>
+        /// <param name="scale">The scaling factor, in percent.</param>
+        /// <param name="quant">The quantization table to fill.</param>
+        private static void InitQuantizationTable(int i, int scale, ref Block8x8F quant)
+        {
+            DebugGuard.MustBeBetweenOrEqualTo(i, 0, 1, nameof(i));
+            ReadOnlySpan<byte> baseTable = (i != 0) ? UnscaledQuant_Chrominance : UnscaledQuant_Luminance;
+
+            for (int index = 0; index < Block8x8F.Size; index++)
+            {
+                // Adding 50 before dividing by 100 rounds the scaled value instead of truncating it.
+                int scaled = ((baseTable[index] * scale) + 50) / 100;
+                if (scaled > 255)
+                {
+                    scaled = 255;
+                }
+                else if (scaled < 1)
+                {
+                    scaled = 1;
+                }
+
+                quant[index] = scaled;
+            }
+        }
     }
 }
diff --git a/src/ImageSharp/Formats/Jpeg/JpegThrowHelper.cs b/src/ImageSharp/Formats/Jpeg/JpegThrowHelper.cs
index fa9eb83917..cc75870e19 100644
--- a/src/ImageSharp/Formats/Jpeg/JpegThrowHelper.cs
+++ b/src/ImageSharp/Formats/Jpeg/JpegThrowHelper.cs
@@ -46,5 +46,8 @@ public static void ThrowNotImplementedException(string errorMessage)
 
         [MethodImpl(InliningOptions.ColdPath)]
         public static void ThrowInvalidImageDimensions(int width, int height) => throw new InvalidImageContentException($"Invalid image dimensions: {width}x{height}.");
+
+        [MethodImpl(InliningOptions.ColdPath)]
+        public static void ThrowDimensionsTooLarge(int width, int height) => throw new ImageFormatException($"Image is too large to encode at {width}x{height} for JPEG format.");
     }
 }
diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Scale16X16To8X8.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Scale16X16To8X8.cs
deleted file mode 100644
index ebd3e40130..0000000000
--- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Scale16X16To8X8.cs
+++ /dev/null
@@ -1,38 +0,0 @@
-// Copyright (c) Six Labors.
-// Licensed under the Apache License, Version 2.0.
-
-using System;
-using BenchmarkDotNet.Attributes;
-using SixLabors.ImageSharp.Formats.Jpeg.Components;
-
-namespace SixLabors.ImageSharp.Benchmarks.Format.Jpeg.Components
-{
-    [Config(typeof(Config.HwIntrinsics_SSE_AVX))]
-    public class Block8x8F_Scale16X16To8X8
-    {
-        private Block8x8F source;
-        private readonly Block8x8F[] target = new Block8x8F[4];
-
-        [GlobalSetup]
-        public void Setup()
-        {
-            var random = new Random();
-
-            float[] f = new float[8 * 8];
-            for (int i = 0; i < f.Length; i++)
-            {
-                f[i] = (float)random.NextDouble();
-            }
-
-            for (int i = 0; i < 4; i++)
-            {
-                this.target[i] = Block8x8F.Load(f);
-            }
-
-            this.source = Block8x8F.Load(f);
-        }
-
-        [Benchmark]
-        public void Scale16X16To8X8() => Block8x8F.Scale16X16To8X8(ref this.source, this.target);
-    }
-}
diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs
index 5a9ceea946..47c6f2c7d4 100644
--- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs
@@ -4,6 +4,7 @@
 using System.Drawing.Imaging;
 using System.IO;
 using BenchmarkDotNet.Attributes;
+using SixLabors.ImageSharp.Formats.Jpeg;
 using SixLabors.ImageSharp.PixelFormats;
 using SixLabors.ImageSharp.Tests;
 using SDImage = System.Drawing.Image;
@@ -12,10 +13,22 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg
 {
     public class EncodeJpeg
     {
-        // System.Drawing needs this.
-        private Stream bmpStream;
+        [Params(75, 90, 100)]
+        public int Quality;
+
+        private const string TestImage = TestImages.Jpeg.BenchmarkSuite.Jpeg420Exif_MidSizeYCbCr;
+
+        // System.Drawing
         private SDImage bmpDrawing;
+        private Stream bmpStream;
+        private ImageCodecInfo jpegCodec;
+        private EncoderParameters encoderParameters;
+
+        // ImageSharp
         private Image<Rgba32> bmpCore;
+        private JpegEncoder encoder420;
+        private JpegEncoder encoder444;
+
         private MemoryStream destinationStream;
 
         [GlobalSetup]
@@ -23,12 +36,20 @@ public void ReadImages()
         {
             if (this.bmpStream == null)
             {
-                const string TestImage = TestImages.Jpeg.BenchmarkSuite.Jpeg420Exif_MidSizeYCbCr;
                 this.bmpStream = File.OpenRead(Path.Combine(TestEnvironment.InputImagesDirectoryFullPath, TestImage));
+
                 this.bmpCore = Image.Load<Rgba32>(this.bmpStream);
                 this.bmpCore.Metadata.ExifProfile = null;
+                this.encoder420 = new JpegEncoder { Quality = this.Quality, Subsample = JpegSubsample.Ratio420 };
+                this.encoder444 = new JpegEncoder { Quality = this.Quality, Subsample = JpegSubsample.Ratio444 };
+
                 this.bmpStream.Position = 0;
                 this.bmpDrawing = SDImage.FromStream(this.bmpStream);
+                this.jpegCodec = GetEncoder(ImageFormat.Jpeg);
+                this.encoderParameters = new EncoderParameters(1);
+                // Quality cast to long is necessary
+                this.encoderParameters.Param[0] = new EncoderParameter(Encoder.Quality, (long)this.Quality);
+
                 this.destinationStream = new MemoryStream();
             }
         }
@@ -38,36 +59,72 @@ public void Cleanup()
         {
             this.bmpStream.Dispose();
             this.bmpStream = null;
+
+            this.destinationStream.Dispose();
+            this.destinationStream = null;
+
             this.bmpCore.Dispose();
             this.bmpDrawing.Dispose();
+
+            this.encoderParameters.Dispose();
         }
 
-        [Benchmark(Baseline = true, Description = "System.Drawing Jpeg")]
+        [Benchmark(Baseline = true, Description = "System.Drawing Jpeg 4:2:0")]
         public void JpegSystemDrawing()
         {
-            this.bmpDrawing.Save(this.destinationStream, ImageFormat.Jpeg);
+            this.bmpDrawing.Save(this.destinationStream, this.jpegCodec, this.encoderParameters);
+            this.destinationStream.Seek(0, SeekOrigin.Begin);
+        }
+
+        [Benchmark(Description = "ImageSharp Jpeg 4:2:0")]
+        public void JpegCore420()
+        {
+            this.bmpCore.SaveAsJpeg(this.destinationStream, this.encoder420);
             this.destinationStream.Seek(0, SeekOrigin.Begin);
         }
 
-        [Benchmark(Description = "ImageSharp Jpeg")]
-        public void JpegCore()
+        [Benchmark(Description = "ImageSharp Jpeg 4:4:4")]
+        public void JpegCore444()
         {
-            this.bmpCore.SaveAsJpeg(this.destinationStream);
+            this.bmpCore.SaveAsJpeg(this.destinationStream, this.encoder444);
             this.destinationStream.Seek(0, SeekOrigin.Begin);
         }
+
+        // Looks up the codec among the installed encoders (the result is handed to Image.Save); returns null when no format id matches.
+        // https://docs.microsoft.com/en-us/dotnet/api/system.drawing.imaging.encoderparameter?redirectedfrom=MSDN&view=net-5.0
+        private static ImageCodecInfo GetEncoder(ImageFormat format)
+        {
+            foreach (ImageCodecInfo codec in ImageCodecInfo.GetImageEncoders())
+            {
+                if (codec.FormatID == format.Guid)
+                {
+                    return codec;
+                }
+            }
+            return null;
+        }
     }
 }
 
 /*
-BenchmarkDotNet=v0.12.1, OS=Windows 10.0.18363.959 (1909/November2018Update/19H2)
-Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
-.NET Core SDK=3.1.302
-  [Host]     : .NET Core 3.1.6 (CoreCLR 4.700.20.26901, CoreFX 4.700.20.31603), X64 RyuJIT
-  DefaultJob : .NET Core 3.1.6 (CoreCLR 4.700.20.26901, CoreFX 4.700.20.31603), X64 RyuJIT
+BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19042
+Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores
+.NET Core SDK=6.0.100-preview.3.21202.5
+  [Host]     : .NET Core 3.1.13 (CoreCLR 4.700.21.11102, CoreFX 4.700.21.11602), X64 RyuJIT  [AttachedDebugger]
+  DefaultJob : .NET Core 3.1.13 (CoreCLR 4.700.21.11102, CoreFX 4.700.21.11602), X64 RyuJIT
 
 
-|                Method |     Mean |     Error |    StdDev | Ratio | RatioSD |
-|---------------------- |---------:|----------:|----------:|------:|--------:|
-| 'System.Drawing Jpeg' | 4.297 ms | 0.0244 ms | 0.0228 ms |  1.00 |    0.00 |
-|     'ImageSharp Jpeg' | 5.286 ms | 0.1034 ms | 0.0967 ms |  1.23 |    0.02 |
+|                      Method | Quality |     Mean |    Error |   StdDev | Ratio | RatioSD |
+|---------------------------- |-------- |---------:|---------:|---------:|------:|--------:|
+| 'System.Drawing Jpeg 4:2:0' |      75 | 30.60 ms | 0.496 ms | 0.464 ms |  1.00 |    0.00 |
+|     'ImageSharp Jpeg 4:2:0' |      75 | 29.86 ms | 0.350 ms | 0.311 ms |  0.98 |    0.02 |
+|     'ImageSharp Jpeg 4:4:4' |      75 | 45.36 ms | 0.899 ms | 1.036 ms |  1.48 |    0.05 |
+|                             |         |          |          |          |       |         |
+| 'System.Drawing Jpeg 4:2:0' |      90 | 34.05 ms | 0.669 ms | 0.687 ms |  1.00 |    0.00 |
+|     'ImageSharp Jpeg 4:2:0' |      90 | 37.26 ms | 0.706 ms | 0.660 ms |  1.10 |    0.03 |
+|     'ImageSharp Jpeg 4:4:4' |      90 | 52.54 ms | 0.579 ms | 0.514 ms |  1.55 |    0.04 |
+|                             |         |          |          |          |       |         |
+| 'System.Drawing Jpeg 4:2:0' |     100 | 39.36 ms | 0.267 ms | 0.237 ms |  1.00 |    0.00 |
+|     'ImageSharp Jpeg 4:2:0' |     100 | 42.44 ms | 0.410 ms | 0.383 ms |  1.08 |    0.01 |
+|     'ImageSharp Jpeg 4:4:4' |     100 | 70.88 ms | 0.508 ms | 0.450 ms |  1.80 |    0.02 |
 */
diff --git a/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs b/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs
index 1db4072932..9aafb6936b 100644
--- a/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs
+++ b/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs
@@ -37,7 +37,7 @@ public void ConvertLut()
             Block8x8F cb = default;
             Block8x8F cr = default;
 
-            this.converter.Convert(this.data.AsSpan(), ref y, ref cb, ref cr);
+            this.converter.Convert444(this.data.AsSpan(), ref y, ref cb, ref cr);
         }
 
         [Benchmark]
@@ -49,7 +49,7 @@ public void ConvertVectorized()
 
             if (RgbToYCbCrConverterVectorized.IsSupported)
             {
-                RgbToYCbCrConverterVectorized.Convert(this.data.AsSpan(), ref y, ref cb, ref cr);
+                RgbToYCbCrConverterVectorized.Convert444(this.data.AsSpan(), ref y, ref cb, ref cr);
             }
         }
     }
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
index 75ad5427c7..d49a6498cd 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs
@@ -2,10 +2,12 @@
 // Licensed under the Apache License, Version 2.0.
 
 using System;
-
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics.X86;
+#endif
 using SixLabors.ImageSharp.Formats.Jpeg.Components;
 using SixLabors.ImageSharp.Tests.Formats.Jpg.Utils;
-
+using SixLabors.ImageSharp.Tests.TestUtilities;
 using Xunit;
 using Xunit.Abstractions;
 
@@ -22,94 +24,180 @@ public FastFloatingPoint(ITestOutputHelper output)
             {
             }
 
-            [Fact]
-            public void IDCT2D8x4_LeftPart()
+            // Reference tests
+            [Theory]
+            [InlineData(1)]
+            [InlineData(2)]
+            [InlineData(3)]
+            public void LLM_TransformIDCT_CompareToNonOptimized(int seed)
             {
-                float[] sourceArray = Create8x8FloatData();
-                var expectedDestArray = new float[64];
+                float[] sourceArray = Create8x8RoundedRandomFloatData(-1000, 1000, seed);
+
+                var source = Block8x8F.Load(sourceArray);
 
-                ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(sourceArray, expectedDestArray);
+                Block8x8F expected = ReferenceImplementations.LLM_FloatingPoint_DCT.TransformIDCT(ref source);
+
+                var temp = default(Block8x8F);
+                var actual = default(Block8x8F);
+                FastFloatingPointDCT.TransformIDCT(ref source, ref actual, ref temp);
 
-                var source = default(Block8x8F);
-                source.LoadFrom(sourceArray);
+                this.CompareBlocks(expected, actual, 1f);
+            }
 
-                var dest = default(Block8x8F);
+            [Theory]
+            [InlineData(1)]
+            [InlineData(2)]
+            [InlineData(3)]
+            public void LLM_TransformIDCT_CompareToAccurate(int seed)
+            {
+                float[] sourceArray = Create8x8RoundedRandomFloatData(-1000, 1000, seed);
 
-                FastFloatingPointDCT.IDCT8x4_LeftPart(ref source, ref dest);
+                var source = Block8x8F.Load(sourceArray);
 
-                var actualDestArray = new float[64];
-                dest.ScaledCopyTo(actualDestArray);
+                Block8x8F expected = ReferenceImplementations.AccurateDCT.TransformIDCT(ref source);
 
-                this.Print8x8Data(expectedDestArray);
-                this.Output.WriteLine("**************");
-                this.Print8x8Data(actualDestArray);
+                var temp = default(Block8x8F);
+                var actual = default(Block8x8F);
+                FastFloatingPointDCT.TransformIDCT(ref source, ref actual, ref temp);
 
-                Assert.Equal(expectedDestArray, actualDestArray);
+                this.CompareBlocks(expected, actual, 1f);
             }
 
-            [Fact]
-            public void IDCT2D8x4_RightPart()
+            // Inverse transform
+            [Theory]
+            [InlineData(1)]
+            [InlineData(2)]
+            public void IDCT8x4_LeftPart(int seed)
             {
-                float[] sourceArray = Create8x8FloatData();
-                var expectedDestArray = new float[64];
-
-                ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(sourceArray.AsSpan(4), expectedDestArray.AsSpan(4));
+                Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
+                var srcBlock = default(Block8x8F);
+                srcBlock.LoadFrom(src);
 
-                var source = default(Block8x8F);
-                source.LoadFrom(sourceArray);
+                var destBlock = default(Block8x8F);
 
-                var dest = default(Block8x8F);
+                var expectedDest = new float[64];
 
-                FastFloatingPointDCT.IDCT8x4_RightPart(ref source, ref dest);
+                // reference
+                ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(src, expectedDest);
 
-                var actualDestArray = new float[64];
-                dest.ScaledCopyTo(actualDestArray);
+                // testee
+                FastFloatingPointDCT.IDCT8x4_LeftPart(ref srcBlock, ref destBlock);
 
-                this.Print8x8Data(expectedDestArray);
-                this.Output.WriteLine("**************");
-                this.Print8x8Data(actualDestArray);
+                var actualDest = new float[64];
+                destBlock.ScaledCopyTo(actualDest);
 
-                Assert.Equal(expectedDestArray, actualDestArray);
+                Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
             }
 
             [Theory]
             [InlineData(1)]
             [InlineData(2)]
-            [InlineData(3)]
-            public void LLM_TransformIDCT_CompareToNonOptimized(int seed)
+            public void IDCT8x4_RightPart(int seed)
             {
-                float[] sourceArray = Create8x8RoundedRandomFloatData(-1000, 1000, seed);
+                Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
+                var srcBlock = default(Block8x8F);
+                srcBlock.LoadFrom(src);
 
-                var source = Block8x8F.Load(sourceArray);
+                var destBlock = default(Block8x8F);
 
-                Block8x8F expected = ReferenceImplementations.LLM_FloatingPoint_DCT.TransformIDCT(ref source);
+                var expectedDest = new float[64];
 
-                var temp = default(Block8x8F);
-                var actual = default(Block8x8F);
-                FastFloatingPointDCT.TransformIDCT(ref source, ref actual, ref temp);
+                // reference
+                ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan(4));
 
-                this.CompareBlocks(expected, actual, 1f);
+                // testee
+                FastFloatingPointDCT.IDCT8x4_RightPart(ref srcBlock, ref destBlock);
+
+                var actualDest = new float[64];
+                destBlock.ScaledCopyTo(actualDest);
+
+                Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
             }
 
             [Theory]
             [InlineData(1)]
             [InlineData(2)]
-            [InlineData(3)]
-            public void LLM_TransformIDCT_CompareToAccurate(int seed)
+            public void IDCT8x8_Avx(int seed)
             {
-                float[] sourceArray = Create8x8RoundedRandomFloatData(-1000, 1000, seed);
+#if SUPPORTS_RUNTIME_INTRINSICS
+                var skip = !Avx.IsSupported;
+#else
+                var skip = true;
+#endif
+
+                if (skip)
+                {
+                    this.Output.WriteLine("No AVX present, skipping test!");
+                    return;
+                }
 
-                var source = Block8x8F.Load(sourceArray);
+                Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
+                var srcBlock = default(Block8x8F);
+                srcBlock.LoadFrom(src);
 
-                Block8x8F expected = ReferenceImplementations.AccurateDCT.TransformIDCT(ref source);
+                var destBlock = default(Block8x8F);
 
-                var temp = default(Block8x8F);
-                var actual = default(Block8x8F);
-                FastFloatingPointDCT.TransformIDCT(ref source, ref actual, ref temp);
+                var expectedDest = new float[64];
 
-                this.CompareBlocks(expected, actual, 1f);
+                // reference, left part
+                ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(src, expectedDest);
+
+                // reference, right part
+                ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan(4));
+
+                // testee, whole 8x8
+                FastFloatingPointDCT.IDCT8x8_Avx(ref srcBlock, ref destBlock);
+
+                var actualDest = new float[64];
+                destBlock.ScaledCopyTo(actualDest);
+
+                Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
             }
 
+            [Theory]
+            [InlineData(1)]
+            [InlineData(2)]
+            public void TransformIDCT(int seed)
+            {
+                static void RunTest(string serialized)
+                {
+                    // The seed arrives serialized; it drives the random 8x8 input shared by both implementations.
+                    int seed = FeatureTestRunner.Deserialize<int>(serialized);
+
+                    Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
+                    var srcBlock = default(Block8x8F);
+                    srcBlock.LoadFrom(src);
+
+                    var destBlock = default(Block8x8F);
+                    var expectedDest = new float[64];
+                    var temp1 = new float[64];
+                    var temp2 = default(Block8x8F);
+
+                    // reference implementation, writes expectedDest
+                    ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D_llm(src, expectedDest, temp1);
+
+                    // implementation under test, writes destBlock
+                    FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref destBlock, ref temp2);
+
+                    var actualDest = new float[64];
+                    destBlock.ScaledCopyTo(actualDest);
+
+                    Assert.Equal(expectedDest, actualDest, new ApproximateFloatComparer(1f));
+                }
+
+                // Exercises 3 paths:
+                // 1. AllowAll - call avx/fma implementation
+                // 2. DisableFMA - call avx implementation without fma acceleration
+                // 3. DisableAVX - call fallback code of Vector4 implementation
+                //
+                // DisableSSE isn't needed because the fallback Vector4 code will compile to either sse or fallback code with the same result
+                FeatureTestRunner.RunWithHwIntrinsicsFeature(
+                    RunTest,
+                    seed,
+                    HwIntrinsics.AllowAll | HwIntrinsics.DisableFMA | HwIntrinsics.DisableAVX);
+            }
+
+            // Forward transform
             [Theory]
             [InlineData(1)]
             [InlineData(2)]
@@ -123,7 +211,10 @@ public void FDCT8x4_LeftPart(int seed)
 
                 var expectedDest = new float[64];
 
+                // reference
                 ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src, expectedDest);
+
+                // testee
                 FastFloatingPointDCT.FDCT8x4_LeftPart(ref srcBlock, ref destBlock);
 
                 var actualDest = new float[64];
@@ -145,7 +236,10 @@ public void FDCT8x4_RightPart(int seed)
 
                 var expectedDest = new float[64];
 
+                // reference
                 ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan(4));
+
+                // testee
                 FastFloatingPointDCT.FDCT8x4_RightPart(ref srcBlock, ref destBlock);
 
                 var actualDest = new float[64];
@@ -157,8 +251,19 @@ public void FDCT8x4_RightPart(int seed)
             [Theory]
             [InlineData(1)]
             [InlineData(2)]
-            public void TransformFDCT(int seed)
+            public void FDCT8x8_Avx(int seed)
             {
+#if SUPPORTS_RUNTIME_INTRINSICS
+                var skip = !Avx.IsSupported;
+#else
+                var skip = true;
+#endif
+                if (skip)
+                {
+                    this.Output.WriteLine("No AVX present, skipping test!");
+                    return;
+                }
+
                 Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
                 var srcBlock = default(Block8x8F);
                 srcBlock.LoadFrom(src);
@@ -166,17 +271,64 @@ public void TransformFDCT(int seed)
                 var destBlock = default(Block8x8F);
 
                 var expectedDest = new float[64];
-                var temp1 = new float[64];
-                var temp2 = default(Block8x8F);
 
-                ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D_llm(src, expectedDest, temp1, downscaleBy8: true);
-                FastFloatingPointDCT.TransformFDCT(ref srcBlock, ref destBlock, ref temp2, false);
+                // reference, left part
+                ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src, expectedDest);
+
+                // reference, right part
+                ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan(4));
+
+                // testee, whole 8x8
+                FastFloatingPointDCT.FDCT8x8_Avx(ref srcBlock, ref destBlock);
 
                 var actualDest = new float[64];
                 destBlock.ScaledCopyTo(actualDest);
 
                 Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f));
             }
+
+            [Theory]
+            [InlineData(1)]
+            [InlineData(2)]
+            public void TransformFDCT(int seed)
+            {
+                static void RunTest(string serialized)
+                {
+                    // The seed arrives serialized; it drives the random 8x8 input shared by both implementations.
+                    int seed = FeatureTestRunner.Deserialize<int>(serialized);
+
+                    Span<float> src = Create8x8RoundedRandomFloatData(-200, 200, seed);
+                    var srcBlock = default(Block8x8F);
+                    srcBlock.LoadFrom(src);
+
+                    var destBlock = default(Block8x8F);
+                    var expectedDest = new float[64];
+                    var temp1 = new float[64];
+                    var temp2 = default(Block8x8F);
+
+                    // reference implementation, writes expectedDest
+                    ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D_llm(src, expectedDest, temp1, downscaleBy8: true);
+
+                    // implementation under test, writes destBlock
+                    FastFloatingPointDCT.TransformFDCT(ref srcBlock, ref destBlock, ref temp2, false);
+
+                    var actualDest = new float[64];
+                    destBlock.ScaledCopyTo(actualDest);
+
+                    Assert.Equal(expectedDest, actualDest, new ApproximateFloatComparer(1f));
+                }
+
+                // Exercises 3 paths:
+                // 1. AllowAll - call avx/fma implementation
+                // 2. DisableFMA - call avx implementation without fma acceleration
+                // 3. DisableAVX - call fallback code of Vector4 implementation
+                //
+                // DisableSSE isn't needed because the fallback Vector4 code will compile to either sse or fallback code with the same result
+                FeatureTestRunner.RunWithHwIntrinsicsFeature(
+                    RunTest,
+                    seed,
+                    HwIntrinsics.AllowAll | HwIntrinsics.DisableFMA | HwIntrinsics.DisableAVX);
+            }
         }
     }
 }
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs
index 9a6fc8d6fd..0d5b550384 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs
@@ -1,7 +1,13 @@
-﻿// Copyright (c) Six Labors.
+// Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.
 
 using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
 using SixLabors.ImageSharp.ColorSpaces;
 using SixLabors.ImageSharp.Formats.Jpeg.Components;
 using SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder;
@@ -23,22 +29,23 @@ public RgbToYCbCrConverterTests(ITestOutputHelper output)
         private ITestOutputHelper Output { get; }
 
         [Fact]
-        public void TestLutConverter()
+        public void TestConverterLut444()
         {
-            Rgb24[] data = CreateTestData();
+            int dataSize = 8 * 8;
+            Rgb24[] data = CreateTestData(dataSize);
             var target = RgbToYCbCrConverterLut.Create();
 
             Block8x8F y = default;
             Block8x8F cb = default;
             Block8x8F cr = default;
 
-            target.Convert(data.AsSpan(), ref y, ref cb, ref cr);
+            target.Convert444(data.AsSpan(), ref y, ref cb, ref cr);
 
-            Verify(data, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(1F));
+            Verify444(data, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(1F));
         }
 
         [Fact]
-        public void TestVectorizedConverter()
+        public void TestConverterVectorized444()
         {
             if (!RgbToYCbCrConverterVectorized.IsSupported)
             {
@@ -46,18 +53,187 @@ public void TestVectorizedConverter()
                 return;
             }
 
-            Rgb24[] data = CreateTestData();
+            int dataSize = 8 * 8;
+            Rgb24[] data = CreateTestData(dataSize);
 
             Block8x8F y = default;
             Block8x8F cb = default;
             Block8x8F cr = default;
 
-            RgbToYCbCrConverterVectorized.Convert(data.AsSpan(), ref y, ref cb, ref cr);
+            RgbToYCbCrConverterVectorized.Convert444(data.AsSpan(), ref y, ref cb, ref cr);
 
-            Verify(data, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(0.0001F));
+            Verify444(data, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(0.0001F));
         }
 
-        private static void Verify(ReadOnlySpan<Rgb24> data, ref Block8x8F yResult, ref Block8x8F cbResult, ref Block8x8F crResult, ApproximateColorSpaceComparer comparer)
+        [Fact]
+        public void TestConverterLut420()
+        {
+            // 16x16 pixels: enough for four 8x8 luma blocks plus one Cb and one Cr block.
+            Span<Rgb24> pixels = CreateTestData(16 * 16);
+            Span<Rgb24> lowerHalf = pixels.Slice(16 * 8);
+            var converter = RgbToYCbCrConverterLut.Create();
+            Block8x8F chromaBlue = default, chromaRed = default;
+            var luma = new Block8x8F[4];
+
+            // First call fills luma[0..1] from the full span, second luma[2..3] from row 8 on.
+            converter.Convert420(pixels, ref luma[0], ref luma[1], ref chromaBlue, ref chromaRed, 0);
+            converter.Convert420(lowerHalf, ref luma[2], ref luma[3], ref chromaBlue, ref chromaRed, 1);
+
+            Verify420(pixels, luma, ref chromaBlue, ref chromaRed, new ApproximateFloatComparer(1F));
+        }
+
+        [Fact]
+        public void TestConverterVectorized420()
+        {
+            // Bail out (with a log line) when the vectorized converter reports itself unsupported.
+            if (!RgbToYCbCrConverterVectorized.IsSupported)
+            {
+                this.Output.WriteLine("No AVX and/or FMA present, skipping test!");
+                return;
+            }
+
+            Span<Rgb24> pixels = CreateTestData(16 * 16);
+            Span<Rgb24> lowerHalf = pixels.Slice(16 * 8);
+            Block8x8F chromaBlue = default, chromaRed = default;
+            var luma = new Block8x8F[4];
+
+            // First call: left/right luma of the upper half; second call: of the lower half.
+            RgbToYCbCrConverterVectorized.Convert420(pixels, ref luma[0], ref luma[1], ref chromaBlue, ref chromaRed, 0);
+            RgbToYCbCrConverterVectorized.Convert420(lowerHalf, ref luma[2], ref luma[3], ref chromaBlue, ref chromaRed, 1);
+
+            Verify420(pixels, luma, ref chromaBlue, ref chromaRed, new ApproximateFloatComparer(1F));
+        }
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+        [Theory]
+        [InlineData(1)]
+        [InlineData(2)]
+        [InlineData(3)]
+        public void Scale16x2_8x1(int seed)
+        {
+            if (!Avx2.IsSupported)
+            {
+                return;
+            }
+
+            // Arrange: seeded random input, four Vector256<float> worth of floats.
+            Span<float> input = new Random(seed).GenerateRandomFloatArray(Vector256<float>.Count * 4, -1000, 1000);
+
+            // Act:
+            Vector256<float> scaled = RgbToYCbCrConverterVectorized.Scale16x2_8x1(MemoryMarshal.Cast<float, Vector256<float>>(input));
+            ref float firstLane = ref Unsafe.As<Vector256<float>, float>(ref scaled);
+
+            // Assert: every lane must equal the scalar reference under a 0.0001 comparer.
+            var comparer = new ApproximateFloatComparer(0.0001f);
+            for (int lane = 0; lane < Vector256<float>.Count; lane++)
+            {
+                float expected = CalculateAverage16x2_8x1(input, lane);
+                float actual = Unsafe.Add(ref firstLane, lane);
+                Assert.True(comparer.Equals(actual, expected), $"Pos {lane}, Expected: {expected}, Actual: {actual}");
+            }
+
+            // Scalar reference: mean of two adjacent values and the pair 16 elements later.
+            static float CalculateAverage16x2_8x1(Span<float> source, int index)
+            {
+                int top = 2 * index;
+                int bottom = top + 16;
+                return 0.25f * (source[top] + source[top + 1] + source[bottom] + source[bottom + 1]);
+            }
+        }
+#endif
+
+        // Asserts that the given blocks match the scalar RgbToYCbCr reference for data.
+        private static void Verify444(
+            ReadOnlySpan<Rgb24> data,
+            ref Block8x8F yResult,
+            ref Block8x8F cbResult,
+            ref Block8x8F crResult,
+            ApproximateColorSpaceComparer comparer)
+        {
+            Block8x8F yRef = default, cbRef = default, crRef = default;
+            RgbToYCbCr(data, ref yRef, ref cbRef, ref crRef);
+
+            for (int i = 0; i < Block8x8F.Size; i++)
+            {
+                var expected = new YCbCr(yRef[i], cbRef[i], crRef[i]);
+                var actual = new YCbCr(yResult[i], cbResult[i], crResult[i]);
+                Assert.True(comparer.Equals(expected, actual), $"Pos {i}, Expected {yRef[i]} == {yResult[i]}, {cbRef[i]} == {cbResult[i]}, {crRef[i]} == {crResult[i]}");
+            }
+        }
+
+        private static void Verify420(
+            ReadOnlySpan<Rgb24> data,
+            Block8x8F[] yResult,
+            ref Block8x8F cbResult,
+            ref Block8x8F crResult,
+            ApproximateFloatComparer comparer)
+        {
+            // data is read as a 16x16 pixel area stored row by row (16 pixels per row).
+            // yResult holds the luma blocks in the order top left, top right,
+            // bottom left, bottom right; cbResult/crResult are checked after downscaling.
+            const int Stride = 16;
+            const int BlockEdge = 8;
+
+            var reference = default(Block8x8F);
+            var cbFull = new Block8x8F[4];
+            var crFull = new Block8x8F[4];
+            Span<Rgb24> quadrantPixels = new Rgb24[BlockEdge * BlockEdge];
+
+            // Convert every 8x8 quadrant with the scalar reference, check its luma
+            // right away and keep the full-resolution chroma for the step below.
+            for (int q = 0; q < 4; q++)
+            {
+                // Bit 1 of q selects the lower half, bit 0 the right half.
+                int start = ((q >> 1) * BlockEdge * Stride) + ((q & 1) * BlockEdge);
+                Copy8x8(data.Slice(start), quadrantPixels);
+                RgbToYCbCr(quadrantPixels, ref reference, ref cbFull[q], ref crFull[q]);
+                VerifyBlock(ref yResult[q], ref reference, comparer);
+            }
+
+            // Cb: downscale the four reference blocks into one and compare.
+            Scale16X16To8X8(ref reference, cbFull);
+            VerifyBlock(ref cbResult, ref reference, comparer);
+
+            // Cr: same procedure, reusing the scratch block.
+            Scale16X16To8X8(ref reference, crFull);
+            VerifyBlock(ref crResult, ref reference, comparer);
+
+            // Copies eight rows of eight pixels out of a source whose rows are
+            // 16 pixels apart, packing them contiguously into dest.
+            static void Copy8x8(ReadOnlySpan<Rgb24> source, Span<Rgb24> dest)
+            {
+                for (int row = 0; row < 8; row++)
+                {
+                    ReadOnlySpan<Rgb24> line = source.Slice(row * 16, 8);
+                    line.CopyTo(dest.Slice(row * 8));
+                }
+            }
+
+            // Shrinks four 8x8 blocks (the quadrants of a 16x16 area) into a single
+            // 8x8 block: every output value is built from one 2x2 cell of the input.
+            static void Scale16X16To8X8(ref Block8x8F dest, ReadOnlySpan<Block8x8F> source)
+            {
+                for (int quadrant = 0; quadrant < 4; quadrant++)
+                {
+                    // Top-left index of this quadrant's 4x4 region in dest: 0, 4, 32 or 36.
+                    int dstOff = ((quadrant & 2) << 4) | ((quadrant & 1) << 2);
+                    Block8x8F block = source[quadrant];
+
+                    for (int cell = 0; cell < 16; cell++)
+                    {
+                        int y = cell >> 2;
+                        int x = cell & 3;
+                        int j = (16 * y) + (2 * x);
+
+                        // Sum of the 2x2 cell, offset by 2 before the division by four.
+                        float sum = block[j] + block[j + 1] + block[j + 8] + block[j + 9];
+                        dest[(8 * y) + x + dstOff] = (sum + 2) * .25F;
+                    }
+                }
+            }
+        }
+
+        private static void RgbToYCbCr(ReadOnlySpan<Rgb24> data, ref Block8x8F y, ref Block8x8F cb, ref Block8x8F cr)
         {
             for (int i = 0; i < data.Length; i++)
             {
@@ -65,17 +241,23 @@ private static void Verify(ReadOnlySpan<Rgb24> data, ref Block8x8F yResult, ref
                 int g = data[i].G;
                 int b = data[i].B;
 
-                float y = (0.299F * r) + (0.587F * g) + (0.114F * b);
-                float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b));
-                float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b));
+                y[i] = (0.299F * r) + (0.587F * g) + (0.114F * b);
+                cb[i] = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b));
+                cr[i] = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b));
+            }
+        }
 
-                Assert.True(comparer.Equals(new YCbCr(y, cb, cr), new YCbCr(yResult[i], cbResult[i], crResult[i])), $"Pos {i}, Expected {y} == {yResult[i]}, {cb} == {cbResult[i]}, {cr} == {crResult[i]}");
+        private static void VerifyBlock(ref Block8x8F res, ref Block8x8F target, ApproximateFloatComparer comparer)
+        {
+            for (int i = 0; i < Block8x8F.Size; i++)
+            {
+                Assert.True(comparer.Equals(res[i], target[i]), $"Pos {i}, Expected: {target[i]}, Got: {res[i]}");
             }
         }
 
-        private static Rgb24[] CreateTestData()
+        private static Rgb24[] CreateTestData(int size)
         {
-            var data = new Rgb24[64];
+            var data = new Rgb24[size];
             var r = new Random();
 
             var random = new byte[3];