diff --git a/shared-infrastructure b/shared-infrastructure index 48e73f455f..1f7ee70281 160000 --- a/shared-infrastructure +++ b/shared-infrastructure @@ -1 +1 @@ -Subproject commit 48e73f455f15eafefbe3175efc7433e5f277e506 +Subproject commit 1f7ee702812f3a1713ab7f749c0faae0ef139ed7 diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs index 0581993014..ef457f7ceb 100644 --- a/src/ImageSharp/Common/Helpers/Numerics.cs +++ b/src/ImageSharp/Common/Helpers/Numerics.cs @@ -23,6 +23,28 @@ internal static class Numerics private const int ShuffleAlphaControl = 0b_11_11_11_11; #endif +#if !SUPPORTS_BITOPERATIONS + /// + /// Gets the counts the number of bits needed to hold an integer. + /// + private static ReadOnlySpan BitCountLut => new byte[] + { + 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, + }; +#endif + /// /// Determine the Greatest CommonDivisor (GCD) of two numbers. /// @@ -756,7 +778,7 @@ public static float Lerp(float value1, float value2, float amount) /// widening them to 32-bit integers and performing four additions. /// /// - /// byte(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16) + /// byte(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16) /// is widened and added onto as such: /// /// accumulator += i32(1, 2, 3, 4); @@ -825,5 +847,26 @@ public static int EvenReduceSum(Vector256 accumulator) return Sse2.ConvertToInt32(vsum); } #endif + + /// + /// Calculates how many minimum bits needed to store given value. + /// + /// Unsigned integer to store + /// Minimum number of bits needed to store given value + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static int MinimumBitsToStore16(uint number) + { +#if !SUPPORTS_BITOPERATIONS + if (number < 0x100) + { + return BitCountLut[(int)number]; + } + + return 8 + BitCountLut[(int)number >> 8]; +#else + const int bitInUnsignedInteger = sizeof(uint) * 8; + return bitInUnsignedInteger - BitOperations.LeadingZeroCount(number); +#endif + } } } diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 4faf577fd9..b530a37e77 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -532,6 +532,7 @@ private static void Shuffle4Slice3( /// /// Performs a multiplication and an addition of the . /// + /// ret = (vm0 * vm1) + va /// The vector to add to the intermediate result. /// The first vector to multiply. /// The second vector to multiply. @@ -552,6 +553,30 @@ public static Vector256 MultiplyAdd( } } + /// + /// Performs a multiplication and a substraction of the . + /// + /// ret = (vm0 * vm1) - vs + /// The vector to substract from the intermediate result. + /// The first vector to multiply. + /// The second vector to multiply. + /// The . + [MethodImpl(InliningOptions.ShortMethod)] + public static Vector256 MultiplySubstract( + in Vector256 vs, + in Vector256 vm0, + in Vector256 vm1) + { + if (Fma.IsSupported) + { + return Fma.MultiplySubtract(vm1, vm0, vs); + } + else + { + return Avx.Subtract(Avx.Multiply(vm0, vm1), vs); + } + } + /// /// as many elements as possible, slicing them down (keeping the remainder). /// diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index 2d19f5ce26..8ca7b0c801 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -18,7 +18,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// /// Represents a Jpeg block with coefficients. /// - [StructLayout(LayoutKind.Sequential)] + [StructLayout(LayoutKind.Explicit)] internal partial struct Block8x8F : IEquatable { /// @@ -27,29 +27,69 @@ internal partial struct Block8x8F : IEquatable public const int Size = 64; #pragma warning disable SA1600 // ElementsMustBeDocumented + [FieldOffset(0)] public Vector4 V0L; + [FieldOffset(16)] public Vector4 V0R; + [FieldOffset(32)] public Vector4 V1L; + [FieldOffset(48)] public Vector4 V1R; + [FieldOffset(64)] public Vector4 V2L; + [FieldOffset(80)] public Vector4 V2R; + [FieldOffset(96)] public Vector4 V3L; + [FieldOffset(112)] public Vector4 V3R; + [FieldOffset(128)] public Vector4 V4L; + [FieldOffset(144)] public Vector4 V4R; + [FieldOffset(160)] public Vector4 V5L; + [FieldOffset(176)] public Vector4 V5R; + [FieldOffset(192)] public Vector4 V6L; + [FieldOffset(208)] public Vector4 V6R; + [FieldOffset(224)] public Vector4 V7L; + [FieldOffset(240)] public Vector4 V7R; + +#if SUPPORTS_RUNTIME_INTRINSICS + /// + /// A number of rows of 8 scalar coefficients each in + /// + public const int RowCount = 8; + + [FieldOffset(0)] + public Vector256 V0; + [FieldOffset(32)] + public Vector256 V1; + [FieldOffset(64)] + public Vector256 V2; + [FieldOffset(96)] + public Vector256 V3; + [FieldOffset(128)] + public Vector256 V4; + [FieldOffset(160)] + public Vector256 V5; + [FieldOffset(192)] + public Vector256 V6; + [FieldOffset(224)] + public Vector256 V7; +#endif #pragma warning restore SA1600 // ElementsMustBeDocumented /// @@ -278,14 +318,14 @@ public void MultiplyInPlace(float value) if (Avx.IsSupported) { var valueVec = Vector256.Create(value); - Unsafe.As>(ref this.V0L) = Avx.Multiply(Unsafe.As>(ref this.V0L), valueVec); - Unsafe.As>(ref this.V1L) = Avx.Multiply(Unsafe.As>(ref this.V1L), valueVec); - Unsafe.As>(ref this.V2L) = Avx.Multiply(Unsafe.As>(ref this.V2L), valueVec); - Unsafe.As>(ref this.V3L) = Avx.Multiply(Unsafe.As>(ref this.V3L), valueVec); - Unsafe.As>(ref this.V4L) = Avx.Multiply(Unsafe.As>(ref this.V4L), valueVec); - Unsafe.As>(ref this.V5L) = Avx.Multiply(Unsafe.As>(ref this.V5L), valueVec); - Unsafe.As>(ref this.V6L) = Avx.Multiply(Unsafe.As>(ref this.V6L), valueVec); - Unsafe.As>(ref this.V7L) = Avx.Multiply(Unsafe.As>(ref this.V7L), valueVec); + this.V0 = Avx.Multiply(this.V0, valueVec); + this.V1 = Avx.Multiply(this.V1, valueVec); + this.V2 = Avx.Multiply(this.V2, valueVec); + this.V3 = Avx.Multiply(this.V3, valueVec); + this.V4 = Avx.Multiply(this.V4, valueVec); + this.V5 = Avx.Multiply(this.V5, valueVec); + this.V6 = Avx.Multiply(this.V6, valueVec); + this.V7 = Avx.Multiply(this.V7, valueVec); } else #endif @@ -319,45 +359,14 @@ public unsafe void MultiplyInPlace(ref Block8x8F other) #if SUPPORTS_RUNTIME_INTRINSICS if (Avx.IsSupported) { - Unsafe.As>(ref this.V0L) - = Avx.Multiply( - Unsafe.As>(ref this.V0L), - Unsafe.As>(ref other.V0L)); - - Unsafe.As>(ref this.V1L) - = Avx.Multiply( - Unsafe.As>(ref this.V1L), - Unsafe.As>(ref other.V1L)); - - Unsafe.As>(ref this.V2L) - = Avx.Multiply( - Unsafe.As>(ref this.V2L), - Unsafe.As>(ref other.V2L)); - - Unsafe.As>(ref this.V3L) - = Avx.Multiply( - Unsafe.As>(ref this.V3L), - Unsafe.As>(ref other.V3L)); - - Unsafe.As>(ref this.V4L) - = Avx.Multiply( - Unsafe.As>(ref this.V4L), - Unsafe.As>(ref other.V4L)); - - Unsafe.As>(ref this.V5L) - = Avx.Multiply( - Unsafe.As>(ref this.V5L), - Unsafe.As>(ref other.V5L)); - - Unsafe.As>(ref this.V6L) - = Avx.Multiply( - Unsafe.As>(ref this.V6L), - Unsafe.As>(ref other.V6L)); - - Unsafe.As>(ref this.V7L) - = Avx.Multiply( - Unsafe.As>(ref this.V7L), - Unsafe.As>(ref other.V7L)); + this.V0 = Avx.Multiply(this.V0, other.V0); + this.V1 = Avx.Multiply(this.V1, other.V1); + this.V2 = Avx.Multiply(this.V2, other.V2); + this.V3 = Avx.Multiply(this.V3, other.V3); + this.V4 = Avx.Multiply(this.V4, other.V4); + this.V5 = Avx.Multiply(this.V5, other.V5); + this.V6 = Avx.Multiply(this.V6, other.V6); + this.V7 = Avx.Multiply(this.V7, other.V7); } else #endif @@ -392,14 +401,14 @@ public void AddInPlace(float value) if (Avx.IsSupported) { var valueVec = Vector256.Create(value); - Unsafe.As>(ref this.V0L) = Avx.Add(Unsafe.As>(ref this.V0L), valueVec); - Unsafe.As>(ref this.V1L) = Avx.Add(Unsafe.As>(ref this.V1L), valueVec); - Unsafe.As>(ref this.V2L) = Avx.Add(Unsafe.As>(ref this.V2L), valueVec); - Unsafe.As>(ref this.V3L) = Avx.Add(Unsafe.As>(ref this.V3L), valueVec); - Unsafe.As>(ref this.V4L) = Avx.Add(Unsafe.As>(ref this.V4L), valueVec); - Unsafe.As>(ref this.V5L) = Avx.Add(Unsafe.As>(ref this.V5L), valueVec); - Unsafe.As>(ref this.V6L) = Avx.Add(Unsafe.As>(ref this.V6L), valueVec); - Unsafe.As>(ref this.V7L) = Avx.Add(Unsafe.As>(ref this.V7L), valueVec); + this.V0 = Avx.Add(this.V0, valueVec); + this.V1 = Avx.Add(this.V1, valueVec); + this.V2 = Avx.Add(this.V2, valueVec); + this.V3 = Avx.Add(this.V3, valueVec); + this.V4 = Avx.Add(this.V4, valueVec); + this.V5 = Avx.Add(this.V5, valueVec); + this.V6 = Avx.Add(this.V6, valueVec); + this.V7 = Avx.Add(this.V7, valueVec); } else #endif @@ -468,81 +477,6 @@ public static unsafe void Quantize( DivideRoundAll(ref dest, ref qt); } - /// - /// Scales the 16x16 region represented by the 4 source blocks to the 8x8 DST block. - /// - /// The destination block. - /// The source block. - public static unsafe void Scale16X16To8X8(ref Block8x8F destination, ReadOnlySpan source) - { -#if SUPPORTS_RUNTIME_INTRINSICS - if (Avx2.IsSupported) - { - Scale16X16To8X8Vectorized(ref destination, source); - return; - } -#endif - - Scale16X16To8X8Scalar(ref destination, source); - } - - private static void Scale16X16To8X8Vectorized(ref Block8x8F destination, ReadOnlySpan source) - { -#if SUPPORTS_RUNTIME_INTRINSICS - Debug.Assert(Avx2.IsSupported, "AVX2 is required to execute this method"); - - var f2 = Vector256.Create(2f); - var f025 = Vector256.Create(0.25f); - Vector256 switchInnerDoubleWords = Unsafe.As>(ref MemoryMarshal.GetReference(SimdUtils.HwIntrinsics.PermuteMaskSwitchInnerDWords8x32)); - ref Vector256 destRef = ref Unsafe.As>(ref destination); - - for (int i = 0; i < 2; i++) - { - ref Vector256 in1 = ref Unsafe.As>(ref Unsafe.Add(ref MemoryMarshal.GetReference(source), 2 * i)); - ref Vector256 in2 = ref Unsafe.As>(ref Unsafe.Add(ref MemoryMarshal.GetReference(source), (2 * i) + 1)); - - for (int j = 0; j < 8; j += 2) - { - Vector256 a = Unsafe.Add(ref in1, j); - Vector256 b = Unsafe.Add(ref in1, j + 1); - Vector256 c = Unsafe.Add(ref in2, j); - Vector256 d = Unsafe.Add(ref in2, j + 1); - - Vector256 calc1 = Avx.Shuffle(a, c, 0b10_00_10_00); - Vector256 calc2 = Avx.Shuffle(a, c, 0b11_01_11_01); - Vector256 calc3 = Avx.Shuffle(b, d, 0b10_00_10_00); - Vector256 calc4 = Avx.Shuffle(b, d, 0b11_01_11_01); - - Vector256 sum = Avx.Add(Avx.Add(calc1, calc2), Avx.Add(calc3, calc4)); - Vector256 add = Avx.Add(sum, f2); - Vector256 res = Avx.Multiply(add, f025); - - destRef = Avx2.PermuteVar8x32(res, switchInnerDoubleWords); - destRef = ref Unsafe.Add(ref destRef, 1); - } - } -#endif - } - - private static unsafe void Scale16X16To8X8Scalar(ref Block8x8F destination, ReadOnlySpan source) - { - for (int i = 0; i < 4; i++) - { - int dstOff = ((i & 2) << 4) | ((i & 1) << 2); - Block8x8F iSource = source[i]; - - for (int y = 0; y < 4; y++) - { - for (int x = 0; x < 4; x++) - { - int j = (16 * y) + (2 * x); - float sum = iSource[j] + iSource[j + 1] + iSource[j + 8] + iSource[j + 9]; - destination[(8 * y) + x + dstOff] = (sum + 2) * .25F; - } - } - } - } - [MethodImpl(InliningOptions.ShortMethod)] private static void DivideRoundAll(ref Block8x8F a, ref Block8x8F b) { @@ -553,19 +487,13 @@ private static void DivideRoundAll(ref Block8x8F a, ref Block8x8F b) var vadd = Vector256.Create(.5F); var vone = Vector256.Create(1f); - ref Vector256 aBase = ref Unsafe.AsRef(Unsafe.As>(ref a.V0L)); - ref Vector256 bBase = ref Unsafe.AsRef(Unsafe.As>(ref b.V0L)); - ref Vector256 aEnd = ref Unsafe.Add(ref aBase, 8); - - do + for (int i = 0; i < RowCount; i++) { - Vector256 voff = Avx.Multiply(Avx.Min(Avx.Max(vnegOne, aBase), vone), vadd); - Unsafe.Add(ref aBase, 0) = Avx.Add(Avx.Divide(aBase, bBase), voff); - - aBase = ref Unsafe.Add(ref aBase, 1); - bBase = ref Unsafe.Add(ref bBase, 1); + ref Vector256 aRow = ref Unsafe.Add(ref a.V0, i); + ref Vector256 bRow = ref Unsafe.Add(ref b.V0, i); + Vector256 voff = Avx.Multiply(Avx.Min(Avx.Max(vnegOne, aRow), vone), vadd); + aRow = Avx.Add(Avx.Divide(aRow, bRow), voff); } - while (Unsafe.IsAddressLessThan(ref aBase, ref aEnd)); } else #endif @@ -805,26 +733,26 @@ public void TransposeInto(ref Block8x8F d) Vector256 t0 = Avx.UnpackLow(r0, r1); Vector256 t2 = Avx.UnpackLow(r2, r3); Vector256 v = Avx.Shuffle(t0, t2, 0x4E); - Unsafe.As>(ref d.V0L) = Avx.Blend(t0, v, 0xCC); - Unsafe.As>(ref d.V1L) = Avx.Blend(t2, v, 0x33); + d.V0 = Avx.Blend(t0, v, 0xCC); + d.V1 = Avx.Blend(t2, v, 0x33); Vector256 t4 = Avx.UnpackLow(r4, r5); Vector256 t6 = Avx.UnpackLow(r6, r7); v = Avx.Shuffle(t4, t6, 0x4E); - Unsafe.As>(ref d.V4L) = Avx.Blend(t4, v, 0xCC); - Unsafe.As>(ref d.V5L) = Avx.Blend(t6, v, 0x33); + d.V4 = Avx.Blend(t4, v, 0xCC); + d.V5 = Avx.Blend(t6, v, 0x33); Vector256 t1 = Avx.UnpackHigh(r0, r1); Vector256 t3 = Avx.UnpackHigh(r2, r3); v = Avx.Shuffle(t1, t3, 0x4E); - Unsafe.As>(ref d.V2L) = Avx.Blend(t1, v, 0xCC); - Unsafe.As>(ref d.V3L) = Avx.Blend(t3, v, 0x33); + d.V2 = Avx.Blend(t1, v, 0xCC); + d.V3 = Avx.Blend(t3, v, 0x33); Vector256 t5 = Avx.UnpackHigh(r4, r5); Vector256 t7 = Avx.UnpackHigh(r6, r7); v = Avx.Shuffle(t5, t7, 0x4E); - Unsafe.As>(ref d.V6L) = Avx.Blend(t5, v, 0xCC); - Unsafe.As>(ref d.V7L) = Avx.Blend(t7, v, 0x33); + d.V6 = Avx.Blend(t5, v, 0xCC); + d.V7 = Avx.Blend(t7, v, 0x33); } else #endif diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs index bc2c7634b5..bc6c8c6cc7 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanLut.cs @@ -1,4 +1,4 @@ -// Copyright (c) Six Labors. +// Copyright (c) Six Labors. // Licensed under the Apache License, Version 2.0. namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder @@ -44,7 +44,7 @@ public HuffmanLut(HuffmanSpec spec) } } - this.Values = new uint[maxValue + 1]; + this.Values = new int[maxValue + 1]; int code = 0; int k = 0; @@ -54,7 +54,7 @@ public HuffmanLut(HuffmanSpec spec) int bits = (i + 1) << 24; for (int j = 0; j < spec.Count[i]; j++) { - this.Values[spec.Values[k]] = (uint)(bits | code); + this.Values[spec.Values[k]] = bits | code; code++; k++; } @@ -66,6 +66,6 @@ public HuffmanLut(HuffmanSpec spec) /// /// Gets the collection of huffman values. /// - public uint[] Values { get; } + public int[] Values { get; } } -} \ No newline at end of file +} diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs new file mode 100644 index 0000000000..ca352397b8 --- /dev/null +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/HuffmanScanEncoder.cs @@ -0,0 +1,392 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using System.IO; +using System.Runtime.CompilerServices; +using System.Threading; +using SixLabors.ImageSharp.Memory; +using SixLabors.ImageSharp.PixelFormats; + +namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder +{ + internal class HuffmanScanEncoder + { + /// + /// Number of bytes cached before being written to target stream via Stream.Write(byte[], offest, count). + /// + /// + /// This is subject to change, 1024 seems to be the best value in terms of performance. + /// expects it to be at least 8 (see comments in method body). + /// + private const int EmitBufferSizeInBytes = 1024; + + /// + /// A buffer for reducing the number of stream writes when emitting Huffman tables. + /// + private readonly byte[] emitBuffer = new byte[EmitBufferSizeInBytes]; + + /// + /// Number of filled bytes in buffer + /// + private int emitLen = 0; + + /// + /// Emmited bits 'micro buffer' before being transfered to the . + /// + private int accumulatedBits; + + /// + /// Number of jagged bits stored in + /// + private int bitCount; + + private Block8x8F temporalBlock1; + private Block8x8F temporalBlock2; + + /// + /// The output stream. All attempted writes after the first error become no-ops. + /// + private readonly Stream target; + + public HuffmanScanEncoder(Stream outputStream) + { + this.target = outputStream; + } + + /// + /// Encodes the image with no subsampling. + /// + /// The pixel format. + /// The pixel accessor providing access to the image pixels. + /// Luminance quantization table provided by the callee + /// Chrominance quantization table provided by the callee + /// The token to monitor for cancellation. + public void Encode444(Image pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken) + where TPixel : unmanaged, IPixel + { + var unzig = ZigZag.CreateUnzigTable(); + + // ReSharper disable once InconsistentNaming + int prevDCY = 0, prevDCCb = 0, prevDCCr = 0; + + ImageFrame frame = pixels.Frames.RootFrame; + Buffer2D pixelBuffer = frame.PixelBuffer; + RowOctet currentRows = default; + + var pixelConverter = new YCbCrForwardConverter444(frame); + + for (int y = 0; y < pixels.Height; y += 8) + { + cancellationToken.ThrowIfCancellationRequested(); + currentRows.Update(pixelBuffer, y); + + for (int x = 0; x < pixels.Width; x += 8) + { + pixelConverter.Convert(x, y, ref currentRows); + + prevDCY = this.WriteBlock( + QuantIndex.Luminance, + prevDCY, + ref pixelConverter.Y, + ref luminanceQuantTable, + ref unzig); + + prevDCCb = this.WriteBlock( + QuantIndex.Chrominance, + prevDCCb, + ref pixelConverter.Cb, + ref chrominanceQuantTable, + ref unzig); + + prevDCCr = this.WriteBlock( + QuantIndex.Chrominance, + prevDCCr, + ref pixelConverter.Cr, + ref chrominanceQuantTable, + ref unzig); + } + } + + this.FlushInternalBuffer(); + } + + /// + /// Encodes the image with subsampling. The Cb and Cr components are each subsampled + /// at a factor of 2 both horizontally and vertically. + /// + /// The pixel format. + /// The pixel accessor providing access to the image pixels. + /// Luminance quantization table provided by the callee + /// Chrominance quantization table provided by the callee + /// The token to monitor for cancellation. + public void Encode420(Image pixels, ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable, CancellationToken cancellationToken) + where TPixel : unmanaged, IPixel + { + var unzig = ZigZag.CreateUnzigTable(); + + // ReSharper disable once InconsistentNaming + int prevDCY = 0, prevDCCb = 0, prevDCCr = 0; + ImageFrame frame = pixels.Frames.RootFrame; + Buffer2D pixelBuffer = frame.PixelBuffer; + RowOctet currentRows = default; + + var pixelConverter = new YCbCrForwardConverter420(frame); + + for (int y = 0; y < pixels.Height; y += 16) + { + cancellationToken.ThrowIfCancellationRequested(); + for (int x = 0; x < pixels.Width; x += 16) + { + for (int i = 0; i < 2; i++) + { + int yOff = i * 8; + currentRows.Update(pixelBuffer, y + yOff); + pixelConverter.Convert(x, y, ref currentRows, i); + + prevDCY = this.WriteBlock( + QuantIndex.Luminance, + prevDCY, + ref pixelConverter.YLeft, + ref luminanceQuantTable, + ref unzig); + + prevDCY = this.WriteBlock( + QuantIndex.Luminance, + prevDCY, + ref pixelConverter.YRight, + ref luminanceQuantTable, + ref unzig); + } + + prevDCCb = this.WriteBlock( + QuantIndex.Chrominance, + prevDCCb, + ref pixelConverter.Cb, + ref chrominanceQuantTable, + ref unzig); + + prevDCCr = this.WriteBlock( + QuantIndex.Chrominance, + prevDCCr, + ref pixelConverter.Cr, + ref chrominanceQuantTable, + ref unzig); + } + } + + this.FlushInternalBuffer(); + } + + /// + /// Encodes the image with no chroma, just luminance. + /// + /// The pixel format. + /// The pixel accessor providing access to the image pixels. + /// Luminance quantization table provided by the callee + /// The token to monitor for cancellation. + public void EncodeGrayscale(Image pixels, ref Block8x8F luminanceQuantTable, CancellationToken cancellationToken) + where TPixel : unmanaged, IPixel + { + var unzig = ZigZag.CreateUnzigTable(); + + // ReSharper disable once InconsistentNaming + int prevDCY = 0; + + var pixelConverter = LuminanceForwardConverter.Create(); + ImageFrame frame = pixels.Frames.RootFrame; + Buffer2D pixelBuffer = frame.PixelBuffer; + RowOctet currentRows = default; + + for (int y = 0; y < pixels.Height; y += 8) + { + cancellationToken.ThrowIfCancellationRequested(); + currentRows.Update(pixelBuffer, y); + + for (int x = 0; x < pixels.Width; x += 8) + { + pixelConverter.Convert(frame, x, y, ref currentRows); + + prevDCY = this.WriteBlock( + QuantIndex.Luminance, + prevDCY, + ref pixelConverter.Y, + ref luminanceQuantTable, + ref unzig); + } + } + + this.FlushInternalBuffer(); + } + + /// + /// Writes a block of pixel data using the given quantization table, + /// returning the post-quantized DC value of the DCT-transformed block. + /// The block is in natural (not zig-zag) order. + /// + /// The quantization table index. + /// The previous DC value. + /// Source block + /// Quantization table + /// The 8x8 Unzig block. + /// The . + private int WriteBlock( + QuantIndex index, + int prevDC, + ref Block8x8F src, + ref Block8x8F quant, + ref ZigZag unZig) + { + ref Block8x8F refTemp1 = ref this.temporalBlock1; + ref Block8x8F refTemp2 = ref this.temporalBlock2; + + FastFloatingPointDCT.TransformFDCT(ref src, ref refTemp1, ref refTemp2); + + Block8x8F.Quantize(ref refTemp1, ref refTemp2, ref quant, ref unZig); + + int dc = (int)refTemp2[0]; + + // Emit the DC delta. + this.EmitHuffRLE((2 * (int)index) + 0, 0, dc - prevDC); + + // Emit the AC components. + int h = (2 * (int)index) + 1; + int runLength = 0; + + for (int zig = 1; zig < Block8x8F.Size; zig++) + { + int ac = (int)refTemp2[zig]; + + if (ac == 0) + { + runLength++; + } + else + { + while (runLength > 15) + { + this.EmitHuff(h, 0xf0); + runLength -= 16; + } + + this.EmitHuffRLE(h, runLength, ac); + runLength = 0; + } + } + + if (runLength > 0) + { + this.EmitHuff(h, 0x00); + } + + return dc; + } + + /// + /// Emits the least significant count of bits to the stream write buffer. + /// The precondition is bits + /// + /// < 1<<nBits && nBits <= 16 + /// + /// . + /// + /// The packed bits. + /// The number of bits + [MethodImpl(InliningOptions.ShortMethod)] + private void Emit(int bits, int count) + { + count += this.bitCount; + bits <<= 32 - count; + bits |= this.accumulatedBits; + + // Only write if more than 8 bits. + if (count >= 8) + { + // Track length + while (count >= 8) + { + byte b = (byte)(bits >> 24); + this.emitBuffer[this.emitLen++] = b; + if (b == byte.MaxValue) + { + this.emitBuffer[this.emitLen++] = byte.MinValue; + } + + bits <<= 8; + count -= 8; + } + + // This can emit 4 times of: + // 1 byte guaranteed + // 1 extra byte.MinValue byte if previous one was byte.MaxValue + // Thus writing (1 + 1) * 4 = 8 bytes max + // So we must check if emit buffer has extra 8 bytes, if not - call stream.Write + if (this.emitLen > EmitBufferSizeInBytes - 8) + { + this.target.Write(this.emitBuffer, 0, this.emitLen); + this.emitLen = 0; + } + } + + this.accumulatedBits = bits; + this.bitCount = count; + } + + /// + /// Emits the given value with the given Huffman encoder. + /// + /// The index of the Huffman encoder + /// The value to encode. + [MethodImpl(InliningOptions.ShortMethod)] + private void EmitHuff(int index, int value) + { + int x = HuffmanLut.TheHuffmanLut[index].Values[value]; + this.Emit(x & ((1 << 24) - 1), x >> 24); + } + + /// + /// Emits a run of runLength copies of value encoded with the given Huffman encoder. + /// + /// The index of the Huffman encoder + /// The number of copies to encode. + /// The value to encode. + [MethodImpl(InliningOptions.ShortMethod)] + private void EmitHuffRLE(int index, int runLength, int value) + { + int a = value; + int b = value; + if (a < 0) + { + a = -value; + b = value - 1; + } + + int bt = Numerics.MinimumBitsToStore16((uint)a); + + this.EmitHuff(index, (runLength << 4) | bt); + if (bt > 0) + { + this.Emit(b & ((1 << bt) - 1), bt); + } + } + + /// + /// Writes remaining bytes from internal buffer to the target stream. + /// + /// Pads last byte with 1's if necessary + private void FlushInternalBuffer() + { + // pad last byte with 1's + int padBitsCount = 8 - (this.bitCount % 8); + if (padBitsCount != 0) + { + this.Emit((1 << padBitsCount) - 1, padBitsCount); + } + + // flush remaining bytes + if (this.emitLen != 0) + { + this.target.Write(this.emitBuffer, 0, this.emitLen); + } + } + } +} diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/LuminanceForwardConverter{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/LuminanceForwardConverter{TPixel}.cs index cc81130dd7..fc5b9a8682 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/LuminanceForwardConverter{TPixel}.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/LuminanceForwardConverter{TPixel}.cs @@ -49,7 +49,7 @@ public void Convert(ImageFrame frame, int x, int y, ref RowOctet ref Block8x8F yBlock = ref this.Y; ref L8 l8Start = ref l8Span[0]; - for (int i = 0; i < 64; i++) + for (int i = 0; i < Block8x8F.Size; i++) { ref L8 c = ref Unsafe.Add(ref l8Start, i); yBlock[i] = c.PackedValue; diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs index 3c1a02c5aa..15574a32a2 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterLut.cs @@ -92,48 +92,144 @@ public static RgbToYCbCrConverterLut Create() return tables; } - /// - /// Optimized method to allocates the correct y, cb, and cr values to the DCT blocks from the given r, g, b values. - /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - private void ConvertPixelInto( - int r, - int g, - int b, - ref Block8x8F yResult, - ref Block8x8F cbResult, - ref Block8x8F crResult, - int i) + private float CalculateY(byte r, byte g, byte b) { // float y = (0.299F * r) + (0.587F * g) + (0.114F * b); - yResult[i] = (this.YRTable[r] + this.YGTable[g] + this.YBTable[b]) >> ScaleBits; + return (this.YRTable[r] + this.YGTable[g] + this.YBTable[b]) >> ScaleBits; + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private float CalculateCb(byte r, byte g, byte b) + { // float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)); - cbResult[i] = (this.CbRTable[r] + this.CbGTable[g] + this.CbBTable[b]) >> ScaleBits; + return (this.CbRTable[r] + this.CbGTable[g] + this.CbBTable[b]) >> ScaleBits; + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private float CalculateCr(byte r, byte g, byte b) + { // float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)); - crResult[i] = (this.CbBTable[r] + this.CrGTable[g] + this.CrBTable[b]) >> ScaleBits; + return (this.CbBTable[r] + this.CrGTable[g] + this.CrBTable[b]) >> ScaleBits; } - public void Convert(Span rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock) + /// + /// Converts Rgb24 pixels into YCbCr color space with 4:4:4 subsampling sampling of luminance and chroma. + /// + /// Span of Rgb24 pixel data + /// Resulting Y values block + /// Resulting Cb values block + /// Resulting Cr values block + public void Convert444(Span rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock) { ref Rgb24 rgbStart = ref rgbSpan[0]; - for (int i = 0; i < 64; i++) + for (int i = 0; i < Block8x8F.Size; i++) { - ref Rgb24 c = ref Unsafe.Add(ref rgbStart, i); - - this.ConvertPixelInto( - c.R, - c.G, - c.B, - ref yBlock, - ref cbBlock, - ref crBlock, - i); + Rgb24 c = Unsafe.Add(ref rgbStart, i); + + yBlock[i] = this.CalculateY(c.R, c.G, c.B); + cbBlock[i] = this.CalculateCb(c.R, c.G, c.B); + crBlock[i] = this.CalculateCr(c.R, c.G, c.B); } } + /// + /// Converts Rgb24 pixels into YCbCr color space with 4:2:0 subsampling of luminance and chroma. + /// + /// Calculates 2 out of 4 luminance blocks and half of chroma blocks. This method must be called twice per 4x 8x8 DCT blocks with different row param. + /// Span of Rgb24 pixel data + /// First or "left" resulting Y block + /// Second or "right" resulting Y block + /// Resulting Cb values block + /// Resulting Cr values block + /// Row index of the 16x16 block, 0 or 1 + public void Convert420(Span rgbSpan, ref Block8x8F yBlockLeft, ref Block8x8F yBlockRight, ref Block8x8F cbBlock, ref Block8x8F crBlock, int row) + { + DebugGuard.MustBeBetweenOrEqualTo(row, 0, 1, nameof(row)); + + ref float yBlockLeftRef = ref Unsafe.As(ref yBlockLeft); + ref float yBlockRightRef = ref Unsafe.As(ref yBlockRight); + + // 0-31 or 32-63 + // upper or lower part + int chromaWriteOffset = row * (Block8x8F.Size / 2); + ref float cbBlockRef = ref Unsafe.Add(ref Unsafe.As(ref cbBlock), chromaWriteOffset); + ref float crBlockRef = ref Unsafe.Add(ref Unsafe.As(ref crBlock), chromaWriteOffset); + + ref Rgb24 rgbStart = ref rgbSpan[0]; + + for (int i = 0; i < 8; i += 2) + { + int yBlockWriteOffset = i * 8; + ref Rgb24 stride = ref Unsafe.Add(ref rgbStart, i * 16); + + int chromaOffset = 8 * (i / 2); + + // left + this.ConvertChunk420( + ref stride, + ref Unsafe.Add(ref yBlockLeftRef, yBlockWriteOffset), + ref Unsafe.Add(ref cbBlockRef, chromaOffset), + ref Unsafe.Add(ref crBlockRef, chromaOffset)); + + // right + this.ConvertChunk420( + ref Unsafe.Add(ref stride, 8), + ref Unsafe.Add(ref yBlockRightRef, yBlockWriteOffset), + ref Unsafe.Add(ref cbBlockRef, chromaOffset + 4), + ref Unsafe.Add(ref crBlockRef, chromaOffset + 4)); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void ConvertChunk420(ref Rgb24 stride, ref float yBlock, ref float cbBlock, ref float crBlock) + { + // jpeg 8x8 blocks are processed as 16x16 blocks with 16x8 subpasses (this is done for performance reasons) + // each row is 16 pixels wide thus +16 stride reference offset + // resulting luminance (Y`) are sampled at original resolution thus +8 reference offset + for (int k = 0; k < 8; k += 2) + { + ref float yBlockRef = ref Unsafe.Add(ref yBlock, k); + + // top row + Rgb24 px0 = Unsafe.Add(ref stride, k); + Rgb24 px1 = Unsafe.Add(ref stride, k + 1); + yBlockRef = this.CalculateY(px0.R, px0.G, px0.B); + Unsafe.Add(ref yBlockRef, 1) = this.CalculateY(px1.R, px1.G, px1.B); + + // bottom row + Rgb24 px2 = Unsafe.Add(ref stride, k + 16); + Rgb24 px3 = Unsafe.Add(ref stride, k + 17); + Unsafe.Add(ref yBlockRef, 8) = this.CalculateY(px2.R, px2.G, px2.B); + Unsafe.Add(ref yBlockRef, 9) = this.CalculateY(px3.R, px3.G, px3.B); + + // chroma average for 2x2 pixel block + Unsafe.Add(ref cbBlock, k / 2) = this.CalculateAverageCb(px0, px1, px2, px3); + Unsafe.Add(ref crBlock, k / 2) = this.CalculateAverageCr(px0, px1, px2, px3); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private float CalculateAverageCb(Rgb24 px0, Rgb24 px1, Rgb24 px2, Rgb24 px3) + { + return 0.25f + * (this.CalculateCb(px0.R, px0.G, px0.B) + + this.CalculateCb(px1.R, px1.G, px1.B) + + this.CalculateCb(px2.R, px2.G, px2.B) + + this.CalculateCb(px3.R, px3.G, px3.B)); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private float CalculateAverageCr(Rgb24 px0, Rgb24 px1, Rgb24 px2, Rgb24 px3) + { + return 0.25f + * (this.CalculateCr(px0.R, px0.G, px0.B) + + this.CalculateCr(px1.R, px1.G, px1.B) + + this.CalculateCr(px2.R, px2.G, px2.B) + + this.CalculateCr(px3.R, px3.G, px3.B)); + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static int Fix(float x) => (int)((x * (1L << ScaleBits)) + 0.5F); diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs index 209cc3c6ab..926e7d5a4a 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/RgbToYCbCrConverterVectorized.cs @@ -1,4 +1,4 @@ -// Copyright (c) Six Labors. +// Copyright (c) Six Labors. // Licensed under the Apache License, Version 2.0. using System; @@ -27,19 +27,45 @@ public static bool IsSupported } } + public static int AvxCompatibilityPadding + { + // rgb byte matrices contain 8 strides by 8 pixels each, thus 64 pixels total + // Strides are stored sequentially - one big span of 64 * 3 = 192 bytes + // Each stride has exactly 3 * 8 = 24 bytes or 3 * 8 * 8 = 192 bits + // Avx registers are 256 bits so rgb span will be loaded with extra 64 bits from the next stride: + // stride 0 0 - 192 -(+64bits)-> 256 + // stride 1 192 - 384 -(+64bits)-> 448 + // stride 2 384 - 576 -(+64bits)-> 640 + // stride 3 576 - 768 -(+64bits)-> 832 + // stride 4 768 - 960 -(+64bits)-> 1024 + // stride 5 960 - 1152 -(+64bits)-> 1216 + // stride 6 1152 - 1344 -(+64bits)-> 1408 + // stride 7 1344 - 1536 -(+64bits)-> 1600 <-- READ ACCESS VIOLATION + // + // Total size of the 64 pixel rgb span: 64 * 3 * 8 = 1536 bits, avx operations require 1600 bits + // This is not permitted - we are reading foreign memory + // + // 8 byte padding to rgb byte span will solve this problem without extra code in converters + get + { +#if SUPPORTS_RUNTIME_INTRINSICS + if (IsSupported) + { + return 8; + } +#endif + return 0; + } + } + #if SUPPORTS_RUNTIME_INTRINSICS + private static ReadOnlySpan MoveFirst24BytesToSeparateLanes => new byte[] { 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0 }; - private static ReadOnlySpan MoveLast24BytesToSeparateLanes => new byte[] - { - 2, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, - 5, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0, 1, 0, 0, 0 - }; - private static ReadOnlySpan ExtractRgb => new byte[] { 0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 0xFF, 0xFF, 0xFF, 0xFF, @@ -47,7 +73,15 @@ public static bool IsSupported }; #endif - public static void Convert(ReadOnlySpan rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock) + /// + /// Converts 8x8 Rgb24 pixel matrix to YCbCr pixel matrices with 4:4:4 subsampling + /// + /// Total size of rgb span must be 200 bytes + /// Span of rgb pixels with size of 64 + /// 8x8 destination matrix of Luminance(Y) converted data + /// 8x8 destination matrix of Chrominance(Cb) converted data + /// 8x8 destination matrix of Chrominance(Cr) converted data + public static void Convert444(ReadOnlySpan rgbSpan, ref Block8x8F yBlock, ref Block8x8F cbBlock, ref Block8x8F crBlock) { Debug.Assert(IsSupported, "AVX2 is required to run this converter"); @@ -63,18 +97,20 @@ public static void Convert(ReadOnlySpan rgbSpan, ref Block8x8F yBlock, re var f05 = Vector256.Create(0.5f); var zero = Vector256.Create(0).AsByte(); - ref Vector256 inRef = ref Unsafe.As>(ref MemoryMarshal.GetReference(rgbSpan)); - ref Vector256 destYRef = ref Unsafe.As>(ref yBlock); - ref Vector256 destCbRef = ref Unsafe.As>(ref cbBlock); - ref Vector256 destCrRef = ref Unsafe.As>(ref crBlock); + ref Vector256 rgbByteSpan = ref Unsafe.As>(ref MemoryMarshal.GetReference(rgbSpan)); + ref Vector256 destYRef = ref yBlock.V0; + ref Vector256 destCbRef = ref cbBlock.V0; + ref Vector256 destCrRef = ref crBlock.V0; var extractToLanesMask = Unsafe.As>(ref MemoryMarshal.GetReference(MoveFirst24BytesToSeparateLanes)); var extractRgbMask = Unsafe.As>(ref MemoryMarshal.GetReference(ExtractRgb)); Vector256 rgb, rg, bx; Vector256 r, g, b; - for (int i = 0; i < 7; i++) + + const int bytesPerRgbStride = 24; + for (int i = 0; i < 8; i++) { - rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref inRef, (IntPtr)(24 * i)).AsUInt32(), extractToLanesMask).AsByte(); + rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref rgbByteSpan, (IntPtr)(bytesPerRgbStride * i)).AsUInt32(), extractToLanesMask).AsByte(); rgb = Avx2.Shuffle(rgb, extractRgbMask); @@ -94,27 +130,130 @@ public static void Convert(ReadOnlySpan rgbSpan, ref Block8x8F yBlock, re // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)) Unsafe.Add(ref destCrRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r)); } +#endif + } + + /// + /// Converts 16x8 Rgb24 pixels matrix to 2 Y 8x8 matrices with 4:2:0 subsampling + /// + public static void Convert420(ReadOnlySpan rgbSpan, ref Block8x8F yBlockLeft, ref Block8x8F yBlockRight, ref Block8x8F cbBlock, ref Block8x8F crBlock, int row) + { + Debug.Assert(IsSupported, "AVX2 is required to run this converter"); + +#if SUPPORTS_RUNTIME_INTRINSICS + var f0299 = Vector256.Create(0.299f); + var f0587 = Vector256.Create(0.587f); + var f0114 = Vector256.Create(0.114f); + var fn0168736 = Vector256.Create(-0.168736f); + var fn0331264 = Vector256.Create(-0.331264f); + var f128 = Vector256.Create(128f); + var fn0418688 = Vector256.Create(-0.418688f); + var fn0081312F = Vector256.Create(-0.081312F); + var f05 = Vector256.Create(0.5f); + var zero = Vector256.Create(0).AsByte(); + + ref Vector256 rgbByteSpan = ref Unsafe.As>(ref MemoryMarshal.GetReference(rgbSpan)); + + int destOffset = row * 4; - extractToLanesMask = Unsafe.As>(ref MemoryMarshal.GetReference(MoveLast24BytesToSeparateLanes)); - rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref inRef, (IntPtr)160).AsUInt32(), extractToLanesMask).AsByte(); - rgb = Avx2.Shuffle(rgb, extractRgbMask); + ref Vector256 destCbRef = ref Unsafe.Add(ref Unsafe.As>(ref cbBlock), destOffset); + ref Vector256 destCrRef = ref Unsafe.Add(ref Unsafe.As>(ref crBlock), destOffset); - rg = Avx2.UnpackLow(rgb, zero); - bx = Avx2.UnpackHigh(rgb, zero); + var extractToLanesMask = Unsafe.As>(ref MemoryMarshal.GetReference(MoveFirst24BytesToSeparateLanes)); + var extractRgbMask = Unsafe.As>(ref MemoryMarshal.GetReference(ExtractRgb)); + Vector256 rgb, rg, bx; + Vector256 r, g, b; + + Span> rDataLanes = stackalloc Vector256[4]; + Span> gDataLanes = stackalloc Vector256[4]; + Span> bDataLanes = stackalloc Vector256[4]; + + const int bytesPerRgbStride = 24; + for (int i = 0; i < 4; i++) + { + // 16x2 => 8x1 + // left 8x8 column conversions + for (int j = 0; j < 4; j += 2) + { + rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref rgbByteSpan, (IntPtr)(bytesPerRgbStride * (i * 4 + j))).AsUInt32(), extractToLanesMask).AsByte(); + + rgb = Avx2.Shuffle(rgb, extractRgbMask); + + rg = Avx2.UnpackLow(rgb, zero); + bx = Avx2.UnpackHigh(rgb, zero); + + r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32()); + g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32()); + b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32()); + + int yBlockVerticalOffset = (i * 2) + ((j & 2) >> 1); + + // (0.299F * r) + (0.587F * g) + (0.114F * b); + Unsafe.Add(ref yBlockLeft.V0, yBlockVerticalOffset) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r); + + rDataLanes[j] = r; + gDataLanes[j] = g; + bDataLanes[j] = b; + } + + // 16x2 => 8x1 + // right 8x8 column conversions + for (int j = 1; j < 4; j += 2) + { + rgb = Avx2.PermuteVar8x32(Unsafe.AddByteOffset(ref rgbByteSpan, (IntPtr)(bytesPerRgbStride * (i * 4 + j))).AsUInt32(), extractToLanesMask).AsByte(); + + rgb = Avx2.Shuffle(rgb, extractRgbMask); + + rg = Avx2.UnpackLow(rgb, zero); + bx = Avx2.UnpackHigh(rgb, zero); - r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32()); - g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32()); - b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32()); + r = Avx.ConvertToVector256Single(Avx2.UnpackLow(rg, zero).AsInt32()); + g = Avx.ConvertToVector256Single(Avx2.UnpackHigh(rg, zero).AsInt32()); + b = Avx.ConvertToVector256Single(Avx2.UnpackLow(bx, zero).AsInt32()); - // (0.299F * r) + (0.587F * g) + (0.114F * b); - Unsafe.Add(ref destYRef, 7) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r); + int yBlockVerticalOffset = (i * 2) + ((j & 2) >> 1); - // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)) - Unsafe.Add(ref destCbRef, 7) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r)); + // (0.299F * r) + (0.587F * g) + (0.114F * b); + Unsafe.Add(ref yBlockRight.V0, yBlockVerticalOffset) = SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f0114, b), f0587, g), f0299, r); - // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)) - Unsafe.Add(ref destCrRef, 7) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r)); + rDataLanes[j] = r; + gDataLanes[j] = g; + bDataLanes[j] = b; + } + + r = Scale16x2_8x1(rDataLanes); + g = Scale16x2_8x1(gDataLanes); + b = Scale16x2_8x1(bDataLanes); + + // 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)) + Unsafe.Add(ref destCbRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(f05, b), fn0331264, g), fn0168736, r)); + + // 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)) + Unsafe.Add(ref destCrRef, i) = Avx.Add(f128, SimdUtils.HwIntrinsics.MultiplyAdd(SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(fn0081312F, b), fn0418688, g), f05, r)); + } #endif } + +#if SUPPORTS_RUNTIME_INTRINSICS + /// + /// Scales 16x2 matrix to 8x1 using 2x2 average + /// + /// Input matrix consisting of 4 256bit vectors + /// 256bit vector containing upper and lower scaled parts of the input matrix + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static Vector256 Scale16x2_8x1(ReadOnlySpan> v) + { + Debug.Assert(Avx2.IsSupported, "AVX2 is required to run this converter"); + DebugGuard.IsTrue(v.Length == 4, "Input span must consist of 4 elements"); + + var f025 = Vector256.Create(0.25f); + + Vector256 left = Avx.Add(v[0], v[2]); + Vector256 right = Avx.Add(v[1], v[3]); + Vector256 avg2x2 = Avx.Multiply(Avx.HorizontalAdd(left, right), f025); + + return Avx2.Permute4x64(avg2x2.AsDouble(), 0b11_01_10_00).AsSingle(); + } +#endif } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs new file mode 100644 index 0000000000..a4abd532b3 --- /dev/null +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter420{TPixel}.cs @@ -0,0 +1,121 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using System; +using System.Runtime.InteropServices; +using SixLabors.ImageSharp.Advanced; +using SixLabors.ImageSharp.PixelFormats; + +namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder +{ + /// + /// On-stack worker struct to efficiently encapsulate the TPixel -> Rgb24 -> YCbCr conversion chain of 8x8 pixel blocks. + /// + /// The pixel type to work on + internal ref struct YCbCrForwardConverter420 + where TPixel : unmanaged, IPixel + { + /// + /// Number of pixels processed per single call + /// + private const int PixelsPerSample = 16 * 8; + + /// + /// Total byte size of processed pixels converted from TPixel to + /// + private const int RgbSpanByteSize = PixelsPerSample * 3; + + /// + /// of sampling area from given frame pixel buffer + /// + private static readonly Size SampleSize = new Size(16, 8); + + /// + /// The left Y component + /// + public Block8x8F YLeft; + + /// + /// The left Y component + /// + public Block8x8F YRight; + + /// + /// The Cb component + /// + public Block8x8F Cb; + + /// + /// The Cr component + /// + public Block8x8F Cr; + + /// + /// The color conversion tables + /// + private RgbToYCbCrConverterLut colorTables; + + /// + /// Temporal 16x8 block to hold TPixel data + /// + private Span pixelSpan; + + /// + /// Temporal RGB block + /// + private Span rgbSpan; + + /// + /// Sampled pixel buffer size + /// + private Size samplingAreaSize; + + /// + /// for internal operations + /// + private Configuration config; + + public YCbCrForwardConverter420(ImageFrame frame) + { + // matrices would be filled during convert calls + this.YLeft = default; + this.YRight = default; + this.Cb = default; + this.Cr = default; + + // temporal pixel buffers + this.pixelSpan = new TPixel[PixelsPerSample].AsSpan(); + this.rgbSpan = MemoryMarshal.Cast(new byte[RgbSpanByteSize + RgbToYCbCrConverterVectorized.AvxCompatibilityPadding].AsSpan()); + + // frame data + this.samplingAreaSize = new Size(frame.Width, frame.Height); + this.config = frame.GetConfiguration(); + + // conversion vector fallback data + if (!RgbToYCbCrConverterVectorized.IsSupported) + { + this.colorTables = RgbToYCbCrConverterLut.Create(); + } + else + { + this.colorTables = default; + } + } + + public void Convert(int x, int y, ref RowOctet currentRows, int idx) + { + YCbCrForwardConverter.LoadAndStretchEdges(currentRows, this.pixelSpan, new Point(x, y), SampleSize, this.samplingAreaSize); + + PixelOperations.Instance.ToRgb24(this.config, this.pixelSpan, this.rgbSpan); + + if (RgbToYCbCrConverterVectorized.IsSupported) + { + RgbToYCbCrConverterVectorized.Convert420(this.rgbSpan, ref this.YLeft, ref this.YRight, ref this.Cb, ref this.Cr, idx); + } + else + { + this.colorTables.Convert420(this.rgbSpan, ref this.YLeft, ref this.YRight, ref this.Cb, ref this.Cr, idx); + } + } + } +} diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs new file mode 100644 index 0000000000..ef589272bd --- /dev/null +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter444{TPixel}.cs @@ -0,0 +1,122 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using System; +using System.Runtime.InteropServices; +using SixLabors.ImageSharp.Advanced; +using SixLabors.ImageSharp.PixelFormats; + +namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder +{ + /// + /// On-stack worker struct to efficiently encapsulate the TPixel -> Rgb24 -> YCbCr conversion chain of 8x8 pixel blocks. + /// + /// The pixel type to work on + internal ref struct YCbCrForwardConverter444 + where TPixel : unmanaged, IPixel + { + /// + /// Number of pixels processed per single call + /// + private const int PixelsPerSample = 8 * 8; + + /// + /// Total byte size of processed pixels converted from TPixel to + /// + private const int RgbSpanByteSize = PixelsPerSample * 3; + + /// + /// of sampling area from given frame pixel buffer + /// + private static readonly Size SampleSize = new Size(8, 8); + + /// + /// The Y component + /// + public Block8x8F Y; + + /// + /// The Cb component + /// + public Block8x8F Cb; + + /// + /// The Cr component + /// + public Block8x8F Cr; + + /// + /// The color conversion tables + /// + private RgbToYCbCrConverterLut colorTables; + + /// + /// Temporal 64-byte span to hold unconverted TPixel data + /// + private Span pixelSpan; + + /// + /// Temporal 64-byte span to hold converted Rgb24 data + /// + private Span rgbSpan; + + /// + /// Sampled pixel buffer size + /// + private Size samplingAreaSize; + + /// + /// for internal operations + /// + private Configuration config; + + public YCbCrForwardConverter444(ImageFrame frame) + { + // matrices would be filled during convert calls + this.Y = default; + this.Cb = default; + this.Cr = default; + + // temporal pixel buffers + this.pixelSpan = new TPixel[PixelsPerSample].AsSpan(); + this.rgbSpan = MemoryMarshal.Cast(new byte[RgbSpanByteSize + RgbToYCbCrConverterVectorized.AvxCompatibilityPadding].AsSpan()); + + // frame data + this.samplingAreaSize = new Size(frame.Width, frame.Height); + this.config = frame.GetConfiguration(); + + // conversion vector fallback data + if (!RgbToYCbCrConverterVectorized.IsSupported) + { + this.colorTables = RgbToYCbCrConverterLut.Create(); + } + else + { + this.colorTables = default; + } + } + + /// + /// Converts a 8x8 image area inside 'pixels' at position (x,y) placing the result members of the structure (, , ) + /// + public void Convert(int x, int y, ref RowOctet currentRows) + { + YCbCrForwardConverter.LoadAndStretchEdges(currentRows, this.pixelSpan, new Point(x, y), SampleSize, this.samplingAreaSize); + + PixelOperations.Instance.ToRgb24(this.config, this.pixelSpan, this.rgbSpan); + + ref Block8x8F yBlock = ref this.Y; + ref Block8x8F cbBlock = ref this.Cb; + ref Block8x8F crBlock = ref this.Cr; + + if (RgbToYCbCrConverterVectorized.IsSupported) + { + RgbToYCbCrConverterVectorized.Convert444(this.rgbSpan, ref yBlock, ref cbBlock, ref crBlock); + } + else + { + this.colorTables.Convert444(this.rgbSpan, ref yBlock, ref cbBlock, ref crBlock); + } + } + } +} diff --git a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs index 81e64b277b..f5ef770914 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/YCbCrForwardConverter{TPixel}.cs @@ -2,81 +2,59 @@ // Licensed under the Apache License, Version 2.0. using System; -using SixLabors.ImageSharp.Advanced; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; using SixLabors.ImageSharp.PixelFormats; namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder { - /// - /// On-stack worker struct to efficiently encapsulate the TPixel -> Rgb24 -> YCbCr conversion chain of 8x8 pixel blocks. - /// - /// The pixel type to work on - internal ref struct YCbCrForwardConverter + internal static class YCbCrForwardConverter where TPixel : unmanaged, IPixel { - /// - /// The Y component - /// - public Block8x8F Y; - - /// - /// The Cb component - /// - public Block8x8F Cb; - - /// - /// The Cr component - /// - public Block8x8F Cr; + public static void LoadAndStretchEdges(RowOctet source, Span dest, Point start, Size sampleSize, Size totalSize) + { + DebugGuard.MustBeBetweenOrEqualTo(start.X, 1, totalSize.Width - 1, nameof(start.X)); + DebugGuard.MustBeBetweenOrEqualTo(start.Y, 1, totalSize.Height - 1, nameof(start.Y)); - /// - /// The color conversion tables - /// - private RgbToYCbCrConverterLut colorTables; + int width = Math.Min(sampleSize.Width, totalSize.Width - start.X); + int height = Math.Min(sampleSize.Height, totalSize.Height - start.Y); - /// - /// Temporal 8x8 block to hold TPixel data - /// - private GenericBlock8x8 pixelBlock; + uint byteWidth = (uint)(width * Unsafe.SizeOf()); + int remainderXCount = sampleSize.Width - width; - /// - /// Temporal RGB block - /// - private GenericBlock8x8 rgbBlock; + ref byte blockStart = ref MemoryMarshal.GetReference(MemoryMarshal.Cast(dest)); + int rowSizeInBytes = sampleSize.Width * Unsafe.SizeOf(); - public static YCbCrForwardConverter Create() - { - var result = default(YCbCrForwardConverter); - if (!RgbToYCbCrConverterVectorized.IsSupported) + for (int y = 0; y < height; y++) { - // Avoid creating lookup tables, when vectorized converter is supported - result.colorTables = RgbToYCbCrConverterLut.Create(); - } + Span row = source[y]; - return result; - } + ref byte s = ref Unsafe.As(ref row[start.X]); + ref byte d = ref Unsafe.Add(ref blockStart, y * rowSizeInBytes); - /// - /// Converts a 8x8 image area inside 'pixels' at position (x,y) placing the result members of the structure (, , ) - /// - public void Convert(ImageFrame frame, int x, int y, ref RowOctet currentRows) - { - this.pixelBlock.LoadAndStretchEdges(frame.PixelBuffer, x, y, ref currentRows); + Unsafe.CopyBlock(ref d, ref s, byteWidth); + + ref TPixel last = ref Unsafe.Add(ref Unsafe.As(ref d), width - 1); - Span rgbSpan = this.rgbBlock.AsSpanUnsafe(); - PixelOperations.Instance.ToRgb24(frame.GetConfiguration(), this.pixelBlock.AsSpanUnsafe(), rgbSpan); + for (int x = 1; x <= remainderXCount; x++) + { + Unsafe.Add(ref last, x) = last; + } + } - ref Block8x8F yBlock = ref this.Y; - ref Block8x8F cbBlock = ref this.Cb; - ref Block8x8F crBlock = ref this.Cr; + int remainderYCount = sampleSize.Height - height; - if (RgbToYCbCrConverterVectorized.IsSupported) + if (remainderYCount == 0) { - RgbToYCbCrConverterVectorized.Convert(rgbSpan, ref yBlock, ref cbBlock, ref crBlock); + return; } - else + + ref byte lastRowStart = ref Unsafe.Add(ref blockStart, (height - 1) * rowSizeInBytes); + + for (int y = 1; y <= remainderYCount; y++) { - this.colorTables.Convert(rgbSpan, ref yBlock, ref cbBlock, ref crBlock); + ref byte remStart = ref Unsafe.Add(ref lastRowStart, rowSizeInBytes * y); + Unsafe.CopyBlock(ref remStart, ref lastRowStart, (uint)rowSizeInBytes); } } } diff --git a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs index a6d0622dd8..f31d07efca 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/FastFloatingPointDCT.cs @@ -1,8 +1,13 @@ // Copyright (c) Six Labors. // Licensed under the Apache License, Version 2.0. +using System.Diagnostics; using System.Numerics; using System.Runtime.CompilerServices; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif // ReSharper disable InconsistentNaming namespace SixLabors.ImageSharp.Formats.Jpeg.Components @@ -10,7 +15,7 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components /// /// Contains inaccurate, but fast forward and inverse DCT implementations. /// - internal static class FastFloatingPointDCT + internal static partial class FastFloatingPointDCT { #pragma warning disable SA1310 // FieldNamesMustNotContainUnderscore private const float C_1_175876 = 1.175875602f; @@ -38,147 +43,31 @@ internal static class FastFloatingPointDCT private const float C_0_765367 = 0.765366865f; private const float C_0_125 = 0.1250f; + +#if SUPPORTS_RUNTIME_INTRINSICS + private static readonly Vector256 C_V_0_5411 = Vector256.Create(0.541196f); + private static readonly Vector256 C_V_1_3065 = Vector256.Create(1.306563f); + private static readonly Vector256 C_V_1_1758 = Vector256.Create(1.175876f); + private static readonly Vector256 C_V_0_7856 = Vector256.Create(0.785695f); + private static readonly Vector256 C_V_1_3870 = Vector256.Create(1.387040f); + private static readonly Vector256 C_V_0_2758 = Vector256.Create(0.275899f); + + private static readonly Vector256 C_V_n1_9615 = Vector256.Create(-1.961570560f); + private static readonly Vector256 C_V_n0_3901 = Vector256.Create(-0.390180644f); + private static readonly Vector256 C_V_n0_8999 = Vector256.Create(-0.899976223f); + private static readonly Vector256 C_V_n2_5629 = Vector256.Create(-2.562915447f); + private static readonly Vector256 C_V_0_2986 = Vector256.Create(0.298631336f); + private static readonly Vector256 C_V_2_0531 = Vector256.Create(2.053119869f); + private static readonly Vector256 C_V_3_0727 = Vector256.Create(3.072711026f); + private static readonly Vector256 C_V_1_5013 = Vector256.Create(1.501321110f); + private static readonly Vector256 C_V_n1_8477 = Vector256.Create(-1.847759065f); + private static readonly Vector256 C_V_0_7653 = Vector256.Create(0.765366865f); + + private static Vector256 C_V_InvSqrt2 = Vector256.Create(0.707107f); +#endif #pragma warning restore SA1310 // FieldNamesMustNotContainUnderscore private static readonly Vector4 InvSqrt2 = new Vector4(0.707107f); - /// - /// Apply floating point IDCT transformation into dest, using a temporary block 'temp' provided by the caller (optimization). - /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239 - /// - /// Source - /// Destination - /// Temporary block provided by the caller - public static void TransformIDCT(ref Block8x8F src, ref Block8x8F dest, ref Block8x8F temp) - { - src.TransposeInto(ref temp); - - IDCT8x4_LeftPart(ref temp, ref dest); - IDCT8x4_RightPart(ref temp, ref dest); - - dest.TransposeInto(ref temp); - - IDCT8x4_LeftPart(ref temp, ref dest); - IDCT8x4_RightPart(ref temp, ref dest); - - // TODO: What if we leave the blocks in a scaled-by-x8 state until final color packing? - dest.MultiplyInPlace(C_0_125); - } - - /// - /// Do IDCT internal operations on the left part of the block. Original src: - /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261 - /// - /// The source block - /// Destination block - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d) - { - Vector4 my1 = s.V1L; - Vector4 my7 = s.V7L; - Vector4 mz0 = my1 + my7; - - Vector4 my3 = s.V3L; - Vector4 mz2 = my3 + my7; - Vector4 my5 = s.V5L; - Vector4 mz1 = my3 + my5; - Vector4 mz3 = my1 + my5; - - Vector4 mz4 = (mz0 + mz1) * C_1_175876; - - mz2 = (mz2 * C_1_961571) + mz4; - mz3 = (mz3 * C_0_390181) + mz4; - mz0 = mz0 * C_0_899976; - mz1 = mz1 * C_2_562915; - - Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2; - Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3; - Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2; - Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3; - - Vector4 my2 = s.V2L; - Vector4 my6 = s.V6L; - mz4 = (my2 + my6) * C_0_541196; - Vector4 my0 = s.V0L; - Vector4 my4 = s.V4L; - mz0 = my0 + my4; - mz1 = my0 - my4; - - mz2 = mz4 + (my6 * C_1_847759); - mz3 = mz4 + (my2 * C_0_765367); - - my0 = mz0 + mz3; - my3 = mz0 - mz3; - my1 = mz1 + mz2; - my2 = mz1 - mz2; - - d.V0L = my0 + mb0; - d.V7L = my0 - mb0; - d.V1L = my1 + mb1; - d.V6L = my1 - mb1; - d.V2L = my2 + mb2; - d.V5L = my2 - mb2; - d.V3L = my3 + mb3; - d.V4L = my3 - mb3; - } - - /// - /// Do IDCT internal operations on the right part of the block. - /// Original src: - /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261 - /// - /// The source block - /// The destination block - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d) - { - Vector4 my1 = s.V1R; - Vector4 my7 = s.V7R; - Vector4 mz0 = my1 + my7; - - Vector4 my3 = s.V3R; - Vector4 mz2 = my3 + my7; - Vector4 my5 = s.V5R; - Vector4 mz1 = my3 + my5; - Vector4 mz3 = my1 + my5; - - Vector4 mz4 = (mz0 + mz1) * C_1_175876; - - mz2 = (mz2 * C_1_961571) + mz4; - mz3 = (mz3 * C_0_390181) + mz4; - mz0 = mz0 * C_0_899976; - mz1 = mz1 * C_2_562915; - - Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2; - Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3; - Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2; - Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3; - - Vector4 my2 = s.V2R; - Vector4 my6 = s.V6R; - mz4 = (my2 + my6) * C_0_541196; - Vector4 my0 = s.V0R; - Vector4 my4 = s.V4R; - mz0 = my0 + my4; - mz1 = my0 - my4; - - mz2 = mz4 + (my6 * C_1_847759); - mz3 = mz4 + (my2 * C_0_765367); - - my0 = mz0 + mz3; - my3 = mz0 - mz3; - my1 = mz1 + mz2; - my2 = mz1 - mz2; - - d.V0R = my0 + mb0; - d.V7R = my0 - mb0; - d.V1R = my1 + mb1; - d.V6R = my1 - mb1; - d.V2R = my2 + mb2; - d.V5R = my2 - mb2; - d.V3R = my3 + mb3; - d.V4R = my3 - mb3; - } - /// /// Original: /// @@ -309,11 +198,84 @@ public static void FDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d) } /// - /// Apply floating point IDCT transformation into dest, using a temporary block 'temp' provided by the caller (optimization) + /// Combined operation of and + /// using AVX commands. + /// + /// Source + /// Destination + public static void FDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d) + { +#if SUPPORTS_RUNTIME_INTRINSICS + Debug.Assert(Avx.IsSupported, "AVX is required to execute this method"); + + Vector256 t0 = Avx.Add(s.V0, s.V7); + Vector256 t7 = Avx.Subtract(s.V0, s.V7); + Vector256 t1 = Avx.Add(s.V1, s.V6); + Vector256 t6 = Avx.Subtract(s.V1, s.V6); + Vector256 t2 = Avx.Add(s.V2, s.V5); + Vector256 t5 = Avx.Subtract(s.V2, s.V5); + Vector256 t3 = Avx.Add(s.V3, s.V4); + Vector256 t4 = Avx.Subtract(s.V3, s.V4); + + Vector256 c0 = Avx.Add(t0, t3); + Vector256 c1 = Avx.Add(t1, t2); + + // 0 4 + d.V0 = Avx.Add(c0, c1); + d.V4 = Avx.Subtract(c0, c1); + + Vector256 c3 = Avx.Subtract(t0, t3); + Vector256 c2 = Avx.Subtract(t1, t2); + + // 2 6 + d.V2 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(c2, C_V_0_5411), c3, C_V_1_3065); + d.V6 = SimdUtils.HwIntrinsics.MultiplySubstract(Avx.Multiply(c2, C_V_1_3065), c3, C_V_0_5411); + + c3 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(t4, C_V_1_1758), t7, C_V_0_7856); + c0 = SimdUtils.HwIntrinsics.MultiplySubstract(Avx.Multiply(t4, C_V_0_7856), t7, C_V_1_1758); + + c2 = SimdUtils.HwIntrinsics.MultiplyAdd(Avx.Multiply(t5, C_V_1_3870), C_V_0_2758, t6); + c1 = SimdUtils.HwIntrinsics.MultiplySubstract(Avx.Multiply(C_V_0_2758, t5), t6, C_V_1_3870); + + // 3 5 + d.V3 = Avx.Subtract(c0, c2); + d.V5 = Avx.Subtract(c3, c1); + + c0 = Avx.Multiply(Avx.Add(c0, c2), C_V_InvSqrt2); + c3 = Avx.Multiply(Avx.Add(c3, c1), C_V_InvSqrt2); + + // 1 7 + d.V1 = Avx.Add(c0, c3); + d.V7 = Avx.Subtract(c0, c3); +#endif + } + + /// + /// Performs 8x8 matrix Forward Discrete Cosine Transform + /// + /// Source + /// Destination + public static void FDCT8x8(ref Block8x8F s, ref Block8x8F d) + { +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx.IsSupported) + { + FDCT8x8_Avx(ref s, ref d); + } + else +#endif + { + FDCT8x4_LeftPart(ref s, ref d); + FDCT8x4_RightPart(ref s, ref d); + } + } + + /// + /// Apply floating point FDCT from src into dest /// /// Source /// Destination - /// Temporary block provided by the caller + /// Temporary block provided by the caller for optimization /// If true, a constant -128.0 offset is applied for all values before FDCT public static void TransformFDCT( ref Block8x8F src, @@ -327,14 +289,225 @@ public static void TransformFDCT( temp.AddInPlace(-128F); } - FDCT8x4_LeftPart(ref temp, ref dest); - FDCT8x4_RightPart(ref temp, ref dest); + FDCT8x8(ref temp, ref dest); dest.TransposeInto(ref temp); - FDCT8x4_LeftPart(ref temp, ref dest); - FDCT8x4_RightPart(ref temp, ref dest); + FDCT8x8(ref temp, ref dest); + + dest.MultiplyInPlace(C_0_125); + } + + /// + /// Performs 8x8 matrix Inverse Discrete Cosine Transform + /// + /// Source + /// Destination + public static void IDCT8x8(ref Block8x8F s, ref Block8x8F d) + { +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx.IsSupported) + { + IDCT8x8_Avx(ref s, ref d); + } + else +#endif + { + IDCT8x4_LeftPart(ref s, ref d); + IDCT8x4_RightPart(ref s, ref d); + } + } + + /// + /// Do IDCT internal operations on the left part of the block. Original src: + /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261 + /// + /// The source block + /// Destination block + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void IDCT8x4_LeftPart(ref Block8x8F s, ref Block8x8F d) + { + Vector4 my1 = s.V1L; + Vector4 my7 = s.V7L; + Vector4 mz0 = my1 + my7; + + Vector4 my3 = s.V3L; + Vector4 mz2 = my3 + my7; + Vector4 my5 = s.V5L; + Vector4 mz1 = my3 + my5; + Vector4 mz3 = my1 + my5; + Vector4 mz4 = (mz0 + mz1) * C_1_175876; + + mz2 = (mz2 * C_1_961571) + mz4; + mz3 = (mz3 * C_0_390181) + mz4; + mz0 = mz0 * C_0_899976; + mz1 = mz1 * C_2_562915; + + Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2; + Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3; + Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2; + Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3; + + Vector4 my2 = s.V2L; + Vector4 my6 = s.V6L; + mz4 = (my2 + my6) * C_0_541196; + Vector4 my0 = s.V0L; + Vector4 my4 = s.V4L; + mz0 = my0 + my4; + mz1 = my0 - my4; + + mz2 = mz4 + (my6 * C_1_847759); + mz3 = mz4 + (my2 * C_0_765367); + + my0 = mz0 + mz3; + my3 = mz0 - mz3; + my1 = mz1 + mz2; + my2 = mz1 - mz2; + + d.V0L = my0 + mb0; + d.V7L = my0 - mb0; + d.V1L = my1 + mb1; + d.V6L = my1 - mb1; + d.V2L = my2 + mb2; + d.V5L = my2 - mb2; + d.V3L = my3 + mb3; + d.V4L = my3 - mb3; + } + + /// + /// Do IDCT internal operations on the right part of the block. + /// Original src: + /// https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L261 + /// + /// The source block + /// The destination block + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void IDCT8x4_RightPart(ref Block8x8F s, ref Block8x8F d) + { + Vector4 my1 = s.V1R; + Vector4 my7 = s.V7R; + Vector4 mz0 = my1 + my7; + + Vector4 my3 = s.V3R; + Vector4 mz2 = my3 + my7; + Vector4 my5 = s.V5R; + Vector4 mz1 = my3 + my5; + Vector4 mz3 = my1 + my5; + + Vector4 mz4 = (mz0 + mz1) * C_1_175876; + + mz2 = (mz2 * C_1_961571) + mz4; + mz3 = (mz3 * C_0_390181) + mz4; + mz0 = mz0 * C_0_899976; + mz1 = mz1 * C_2_562915; + + Vector4 mb3 = (my7 * C_0_298631) + mz0 + mz2; + Vector4 mb2 = (my5 * C_2_053120) + mz1 + mz3; + Vector4 mb1 = (my3 * C_3_072711) + mz1 + mz2; + Vector4 mb0 = (my1 * C_1_501321) + mz0 + mz3; + + Vector4 my2 = s.V2R; + Vector4 my6 = s.V6R; + mz4 = (my2 + my6) * C_0_541196; + Vector4 my0 = s.V0R; + Vector4 my4 = s.V4R; + mz0 = my0 + my4; + mz1 = my0 - my4; + + mz2 = mz4 + (my6 * C_1_847759); + mz3 = mz4 + (my2 * C_0_765367); + + my0 = mz0 + mz3; + my3 = mz0 - mz3; + my1 = mz1 + mz2; + my2 = mz1 - mz2; + + d.V0R = my0 + mb0; + d.V7R = my0 - mb0; + d.V1R = my1 + mb1; + d.V6R = my1 - mb1; + d.V2R = my2 + mb2; + d.V5R = my2 - mb2; + d.V3R = my3 + mb3; + d.V4R = my3 - mb3; + } + + /// + /// Combined operation of and + /// using AVX commands. + /// + /// Source + /// Destination + public static void IDCT8x8_Avx(ref Block8x8F s, ref Block8x8F d) + { +#if SUPPORTS_RUNTIME_INTRINSICS + Debug.Assert(Avx.IsSupported, "AVX is required to execute this method"); + + Vector256 my1 = s.V1; + Vector256 my7 = s.V7; + Vector256 mz0 = Avx.Add(my1, my7); + + Vector256 my3 = s.V3; + Vector256 mz2 = Avx.Add(my3, my7); + Vector256 my5 = s.V5; + Vector256 mz1 = Avx.Add(my3, my5); + Vector256 mz3 = Avx.Add(my1, my5); + + Vector256 mz4 = Avx.Multiply(Avx.Add(mz0, mz1), C_V_1_1758); + + mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz2, C_V_n1_9615); + mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, mz3, C_V_n0_3901); + mz0 = Avx.Multiply(mz0, C_V_n0_8999); + mz1 = Avx.Multiply(mz1, C_V_n2_5629); + + Vector256 mb3 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my7, C_V_0_2986), mz2); + Vector256 mb2 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my5, C_V_2_0531), mz3); + Vector256 mb1 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz1, my3, C_V_3_0727), mz2); + Vector256 mb0 = Avx.Add(SimdUtils.HwIntrinsics.MultiplyAdd(mz0, my1, C_V_1_5013), mz3); + + Vector256 my2 = s.V2; + Vector256 my6 = s.V6; + mz4 = Avx.Multiply(Avx.Add(my2, my6), C_V_0_5411); + Vector256 my0 = s.V0; + Vector256 my4 = s.V4; + mz0 = Avx.Add(my0, my4); + mz1 = Avx.Subtract(my0, my4); + mz2 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my6, C_V_n1_8477); + mz3 = SimdUtils.HwIntrinsics.MultiplyAdd(mz4, my2, C_V_0_7653); + + my0 = Avx.Add(mz0, mz3); + my3 = Avx.Subtract(mz0, mz3); + my1 = Avx.Add(mz1, mz2); + my2 = Avx.Subtract(mz1, mz2); + + d.V0 = Avx.Add(my0, mb0); + d.V7 = Avx.Subtract(my0, mb0); + d.V1 = Avx.Add(my1, mb1); + d.V6 = Avx.Subtract(my1, mb1); + d.V2 = Avx.Add(my2, mb2); + d.V5 = Avx.Subtract(my2, mb2); + d.V3 = Avx.Add(my3, mb3); + d.V4 = Avx.Subtract(my3, mb3); +#endif + } + + /// + /// Apply floating point IDCT transformation into dest, using a temporary block 'temp' provided by the caller (optimization). + /// Ported from https://github.com/norishigefukushima/dct_simd/blob/master/dct/dct8x8_simd.cpp#L239 + /// + /// Source + /// Destination + /// Temporary block provided by the caller + public static void TransformIDCT(ref Block8x8F src, ref Block8x8F dest, ref Block8x8F temp) + { + src.TransposeInto(ref temp); + + IDCT8x8(ref temp, ref dest); + dest.TransposeInto(ref temp); + IDCT8x8(ref temp, ref dest); + + // TODO: What if we leave the blocks in a scaled-by-x8 state until final color packing? dest.MultiplyInPlace(C_0_125); } } diff --git a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs index f5dc1c79fe..6020e6196c 100644 --- a/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs +++ b/src/ImageSharp/Formats/Jpeg/JpegEncoderCore.cs @@ -5,14 +5,11 @@ using System.Buffers.Binary; using System.IO; using System.Linq; -using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; using System.Threading; using SixLabors.ImageSharp.Common.Helpers; using SixLabors.ImageSharp.Formats.Jpeg.Components; using SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder; using SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder; -using SixLabors.ImageSharp.Memory; using SixLabors.ImageSharp.Metadata; using SixLabors.ImageSharp.Metadata.Profiles.Exif; using SixLabors.ImageSharp.Metadata.Profiles.Icc; @@ -32,20 +29,47 @@ internal sealed unsafe class JpegEncoderCore : IImageEncoderInternals private const int QuantizationTableCount = 2; /// - /// A scratch buffer to reduce allocations. + /// Gets the unscaled quantization tables in zig-zag order. Each + /// encoder copies and scales the tables according to its quality parameter. + /// The values are derived from section K.1 after converting from natural to + /// zig-zag order. /// - private readonly byte[] buffer = new byte[20]; + // The C# compiler emits this as a compile-time constant embedded in the PE file. + // This is effectively compiled down to: return new ReadOnlySpan(&data, length) + // More details can be found: https://github.com/dotnet/roslyn/pull/24621 + private static ReadOnlySpan UnscaledQuant_Luminance => new byte[] + { + // Luminance. + 16, 11, 12, 14, 12, 10, 16, 14, 13, 14, 18, 17, 16, 19, 24, + 40, 26, 24, 22, 22, 24, 49, 35, 37, 29, 40, 58, 51, 61, 60, + 57, 51, 56, 55, 64, 72, 92, 78, 64, 68, 87, 69, 55, 56, 80, + 109, 81, 87, 95, 98, 103, 104, 103, 62, 77, 113, 121, 112, + 100, 120, 92, 101, 103, 99, + }; /// - /// A buffer for reducing the number of stream writes when emitting Huffman tables. 64 seems to be enough. + /// Gets the unscaled quantization tables in zig-zag order. Each + /// encoder copies and scales the tables according to its quality parameter. + /// The values are derived from section K.1 after converting from natural to + /// zig-zag order. /// - private readonly byte[] emitBuffer = new byte[64]; + // The C# compiler emits this as a compile-time constant embedded in the PE file. + // This is effectively compiled down to: return new ReadOnlySpan(&data, length) + // More details can be found: https://github.com/dotnet/roslyn/pull/24621 + private static ReadOnlySpan UnscaledQuant_Chrominance => new byte[] + { + // Chrominance. + 17, 18, 18, 24, 21, 24, 47, 26, 26, 47, 99, 66, 56, 66, + 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, + }; /// - /// A buffer for reducing the number of stream writes when emitting Huffman tables. Max combined table lengths + - /// identifier. + /// A scratch buffer to reduce allocations. /// - private readonly byte[] huffmanBuffer = new byte[179]; + private readonly byte[] buffer = new byte[20]; /// /// Gets or sets the subsampling method to use. @@ -62,26 +86,6 @@ internal sealed unsafe class JpegEncoderCore : IImageEncoderInternals /// private readonly JpegColorType? colorType; - /// - /// The accumulated bits to write to the stream. - /// - private uint accumulatedBits; - - /// - /// The accumulated bit count. - /// - private uint bitCount; - - /// - /// The scaled chrominance table, in zig-zag order. - /// - private Block8x8F chrominanceQuantTable; - - /// - /// The scaled luminance table, in zig-zag order. - /// - private Block8x8F luminanceQuantTable; - /// /// The output stream. All attempted writes after the first error become no-ops. /// @@ -98,67 +102,6 @@ public JpegEncoderCore(IJpegEncoderOptions options) this.colorType = options.ColorType; } - /// - /// Gets the counts the number of bits needed to hold an integer. - /// - // The C# compiler emits this as a compile-time constant embedded in the PE file. - // This is effectively compiled down to: return new ReadOnlySpan(&data, length) - // More details can be found: https://github.com/dotnet/roslyn/pull/24621 - private static ReadOnlySpan BitCountLut => new byte[] - { - 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, - }; - - /// - /// Gets the unscaled quantization tables in zig-zag order. Each - /// encoder copies and scales the tables according to its quality parameter. - /// The values are derived from section K.1 after converting from natural to - /// zig-zag order. - /// - // The C# compiler emits this as a compile-time constant embedded in the PE file. - // This is effectively compiled down to: return new ReadOnlySpan(&data, length) - // More details can be found: https://github.com/dotnet/roslyn/pull/24621 - private static ReadOnlySpan UnscaledQuant_Luminance => new byte[] - { - // Luminance. - 16, 11, 12, 14, 12, 10, 16, 14, 13, 14, 18, 17, 16, 19, 24, - 40, 26, 24, 22, 22, 24, 49, 35, 37, 29, 40, 58, 51, 61, 60, - 57, 51, 56, 55, 64, 72, 92, 78, 64, 68, 87, 69, 55, 56, 80, - 109, 81, 87, 95, 98, 103, 104, 103, 62, 77, 113, 121, 112, - 100, 120, 92, 101, 103, 99, - }; - - /// - /// Gets the unscaled quantization tables in zig-zag order. Each - /// encoder copies and scales the tables according to its quality parameter. - /// The values are derived from section K.1 after converting from natural to - /// zig-zag order. - /// - // The C# compiler emits this as a compile-time constant embedded in the PE file. - // This is effectively compiled down to: return new ReadOnlySpan(&data, length) - // More details can be found: https://github.com/dotnet/roslyn/pull/24621 - private static ReadOnlySpan UnscaledQuant_Chrominance => new byte[] - { - // Chrominance. - 17, 18, 18, 24, 21, 24, 47, 26, 26, 47, 99, 66, 56, 66, - 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, - 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, - 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, - 99, 99, 99, 99, 99, 99, 99, 99, - }; - /// /// Encode writes the image to the jpeg baseline format with the given options. /// @@ -171,14 +114,14 @@ public void Encode(Image image, Stream stream, CancellationToken { Guard.NotNull(image, nameof(image)); Guard.NotNull(stream, nameof(stream)); - cancellationToken.ThrowIfCancellationRequested(); - const ushort max = JpegConstants.MaxLength; - if (image.Width >= max || image.Height >= max) + if (image.Width >= JpegConstants.MaxLength || image.Height >= JpegConstants.MaxLength) { - throw new ImageFormatException($"Image is too large to encode at {image.Width}x{image.Height}."); + JpegThrowHelper.ThrowDimensionsTooLarge(image.Width, image.Height); } + cancellationToken.ThrowIfCancellationRequested(); + this.outputStream = stream; ImageMetadata metadata = image.Metadata; @@ -201,10 +144,14 @@ public void Encode(Image image, Stream stream, CancellationToken } // Initialize the quantization tables. - InitQuantizationTable(0, scale, ref this.luminanceQuantTable); + // TODO: This looks ugly, should we write chrominance table for luminance-only images? + // If not - this can code can be simplified + Block8x8F luminanceQuantTable = default; + Block8x8F chrominanceQuantTable = default; + InitQuantizationTable(0, scale, ref luminanceQuantTable); if (componentCount > 1) { - InitQuantizationTable(1, scale, ref this.chrominanceQuantTable); + InitQuantizationTable(1, scale, ref chrominanceQuantTable); } // Write the Start Of Image marker. @@ -214,7 +161,7 @@ public void Encode(Image image, Stream stream, CancellationToken this.WriteProfiles(metadata); // Write the quantization tables. - this.WriteDefineQuantizationTables(); + this.WriteDefineQuantizationTables(ref luminanceQuantTable, ref chrominanceQuantTable); // Write the image dimensions. this.WriteStartOfFrame(image.Width, image.Height, componentCount); @@ -222,13 +169,31 @@ public void Encode(Image image, Stream stream, CancellationToken // Write the Huffman tables. this.WriteDefineHuffmanTables(componentCount); - // Write the image data. + // Write the scan header. this.WriteStartOfScan(image, componentCount, cancellationToken); + // Write the scan compressed data. + var scanEncoder = new HuffmanScanEncoder(stream); + if (this.colorType == JpegColorType.Luminance) + { + scanEncoder.EncodeGrayscale(image, ref luminanceQuantTable, cancellationToken); + } + else + { + switch (subsample) + { + case JpegSubsample.Ratio444: + scanEncoder.Encode444(image, ref luminanceQuantTable, ref chrominanceQuantTable, cancellationToken); + break; + case JpegSubsample.Ratio420: + scanEncoder.Encode420(image, ref luminanceQuantTable, ref chrominanceQuantTable, cancellationToken); + break; + } + } + // Write the End Of Image marker. - this.buffer[0] = JpegConstants.Markers.XFF; - this.buffer[1] = JpegConstants.Markers.EOI; - stream.Write(this.buffer, 0, 2); + this.WriteEndOfImageMarker(); + stream.Flush(); } @@ -248,248 +213,6 @@ private static void WriteDataToDqt(byte[] dqt, ref int offset, QuantIndex i, ref } } - /// - /// Initializes quantization table. - /// - /// The quantization index. - /// The scaling factor. - /// The quantization table. - private static void InitQuantizationTable(int i, int scale, ref Block8x8F quant) - { - DebugGuard.MustBeBetweenOrEqualTo(i, 0, 1, nameof(i)); - ReadOnlySpan unscaledQuant = (i == 0) ? UnscaledQuant_Luminance : UnscaledQuant_Chrominance; - - for (int j = 0; j < Block8x8F.Size; j++) - { - int x = unscaledQuant[j]; - x = ((x * scale) + 50) / 100; - if (x < 1) - { - x = 1; - } - - if (x > 255) - { - x = 255; - } - - quant[j] = x; - } - } - - /// - /// Emits the least significant count of bits of bits to the bit-stream. - /// The precondition is bits - /// - /// < 1<<nBits && nBits <= 16 - /// - /// . - /// - /// The packed bits. - /// The number of bits - /// The reference to the emitBuffer. - [MethodImpl(InliningOptions.ShortMethod)] - private void Emit(uint bits, uint count, ref byte emitBufferBase) - { - count += this.bitCount; - bits <<= (int)(32 - count); - bits |= this.accumulatedBits; - - // Only write if more than 8 bits. - if (count >= 8) - { - // Track length - int len = 0; - while (count >= 8) - { - byte b = (byte)(bits >> 24); - Unsafe.Add(ref emitBufferBase, len++) = b; - if (b == byte.MaxValue) - { - Unsafe.Add(ref emitBufferBase, len++) = byte.MinValue; - } - - bits <<= 8; - count -= 8; - } - - if (len > 0) - { - this.outputStream.Write(this.emitBuffer, 0, len); - } - } - - this.accumulatedBits = bits; - this.bitCount = count; - } - - /// - /// Emits the given value with the given Huffman encoder. - /// - /// The index of the Huffman encoder - /// The value to encode. - /// The reference to the emit buffer. - [MethodImpl(InliningOptions.ShortMethod)] - private void EmitHuff(HuffIndex index, int value, ref byte emitBufferBase) - { - uint x = HuffmanLut.TheHuffmanLut[(int)index].Values[value]; - this.Emit(x & ((1 << 24) - 1), x >> 24, ref emitBufferBase); - } - - /// - /// Emits a run of runLength copies of value encoded with the given Huffman encoder. - /// - /// The index of the Huffman encoder - /// The number of copies to encode. - /// The value to encode. - /// The reference to the emit buffer. - [MethodImpl(InliningOptions.ShortMethod)] - private void EmitHuffRLE(HuffIndex index, int runLength, int value, ref byte emitBufferBase) - { - int a = value; - int b = value; - if (a < 0) - { - a = -value; - b = value - 1; - } - - uint bt; - if (a < 0x100) - { - bt = BitCountLut[a]; - } - else - { - bt = 8 + (uint)BitCountLut[a >> 8]; - } - - this.EmitHuff(index, (int)((uint)(runLength << 4) | bt), ref emitBufferBase); - if (bt > 0) - { - this.Emit((uint)b & (uint)((1 << ((int)bt)) - 1), bt, ref emitBufferBase); - } - } - - /// - /// Encodes the image with no subsampling. - /// - /// The pixel format. - /// The pixel accessor providing access to the image pixels. - /// The token to monitor for cancellation. - /// The reference to the emit buffer. - private void Encode444(Image pixels, CancellationToken cancellationToken, ref byte emitBufferBase) - where TPixel : unmanaged, IPixel - { - // TODO: Need a JpegScanEncoder class or struct that encapsulates the scan-encoding implementation. (Similar to JpegScanDecoder.) - // (Partially done with YCbCrForwardConverter) - Block8x8F temp1 = default; - Block8x8F temp2 = default; - - Block8x8F onStackLuminanceQuantTable = this.luminanceQuantTable; - Block8x8F onStackChrominanceQuantTable = this.chrominanceQuantTable; - - var unzig = ZigZag.CreateUnzigTable(); - - // ReSharper disable once InconsistentNaming - int prevDCY = 0, prevDCCb = 0, prevDCCr = 0; - - var pixelConverter = YCbCrForwardConverter.Create(); - ImageFrame frame = pixels.Frames.RootFrame; - Buffer2D pixelBuffer = frame.PixelBuffer; - RowOctet currentRows = default; - - for (int y = 0; y < pixels.Height; y += 8) - { - cancellationToken.ThrowIfCancellationRequested(); - currentRows.Update(pixelBuffer, y); - - for (int x = 0; x < pixels.Width; x += 8) - { - pixelConverter.Convert(frame, x, y, ref currentRows); - - prevDCY = this.WriteBlock( - QuantIndex.Luminance, - prevDCY, - ref pixelConverter.Y, - ref temp1, - ref temp2, - ref onStackLuminanceQuantTable, - ref unzig, - ref emitBufferBase); - - prevDCCb = this.WriteBlock( - QuantIndex.Chrominance, - prevDCCb, - ref pixelConverter.Cb, - ref temp1, - ref temp2, - ref onStackChrominanceQuantTable, - ref unzig, - ref emitBufferBase); - - prevDCCr = this.WriteBlock( - QuantIndex.Chrominance, - prevDCCr, - ref pixelConverter.Cr, - ref temp1, - ref temp2, - ref onStackChrominanceQuantTable, - ref unzig, - ref emitBufferBase); - } - } - } - - /// - /// Encodes the image with no chroma, just luminance. - /// - /// The pixel format. - /// The pixel accessor providing access to the image pixels. - /// The token to monitor for cancellation. - /// The reference to the emit buffer. - private void EncodeGrayscale(Image pixels, CancellationToken cancellationToken, ref byte emitBufferBase) - where TPixel : unmanaged, IPixel - { - // TODO: Need a JpegScanEncoder class or struct that encapsulates the scan-encoding implementation. (Similar to JpegScanDecoder.) - // (Partially done with YCbCrForwardConverter) - Block8x8F temp1 = default; - Block8x8F temp2 = default; - - Block8x8F onStackLuminanceQuantTable = this.luminanceQuantTable; - - var unzig = ZigZag.CreateUnzigTable(); - - // ReSharper disable once InconsistentNaming - int prevDCY = 0; - - var pixelConverter = LuminanceForwardConverter.Create(); - ImageFrame frame = pixels.Frames.RootFrame; - Buffer2D pixelBuffer = frame.PixelBuffer; - RowOctet currentRows = default; - - for (int y = 0; y < pixels.Height; y += 8) - { - cancellationToken.ThrowIfCancellationRequested(); - currentRows.Update(pixelBuffer, y); - - for (int x = 0; x < pixels.Width; x += 8) - { - pixelConverter.Convert(frame, x, y, ref currentRows); - - prevDCY = this.WriteBlock( - QuantIndex.Luminance, - prevDCY, - ref pixelConverter.Y, - ref temp1, - ref temp2, - ref onStackLuminanceQuantTable, - ref unzig, - ref emitBufferBase); - } - } - } - /// /// Writes the application header containing the JFIF identifier plus extra data. /// @@ -539,72 +262,6 @@ private void WriteApplicationHeader(ImageMetadata meta) this.outputStream.Write(this.buffer, 0, 20); } - /// - /// Writes a block of pixel data using the given quantization table, - /// returning the post-quantized DC value of the DCT-transformed block. - /// The block is in natural (not zig-zag) order. - /// - /// The quantization table index. - /// The previous DC value. - /// Source block - /// Temporal block to be used as FDCT Destination - /// Temporal block 2 - /// Quantization table - /// The 8x8 Unzig block. - /// The reference to the emit buffer. - /// The . - private int WriteBlock( - QuantIndex index, - int prevDC, - ref Block8x8F src, - ref Block8x8F tempDest1, - ref Block8x8F tempDest2, - ref Block8x8F quant, - ref ZigZag unZig, - ref byte emitBufferBase) - { - FastFloatingPointDCT.TransformFDCT(ref src, ref tempDest1, ref tempDest2); - - Block8x8F.Quantize(ref tempDest1, ref tempDest2, ref quant, ref unZig); - - int dc = (int)tempDest2[0]; - - // Emit the DC delta. - this.EmitHuffRLE((HuffIndex)((2 * (int)index) + 0), 0, dc - prevDC, ref emitBufferBase); - - // Emit the AC components. - var h = (HuffIndex)((2 * (int)index) + 1); - int runLength = 0; - - for (int zig = 1; zig < Block8x8F.Size; zig++) - { - int ac = (int)tempDest2[zig]; - - if (ac == 0) - { - runLength++; - } - else - { - while (runLength > 15) - { - this.EmitHuff(h, 0xf0, ref emitBufferBase); - runLength -= 16; - } - - this.EmitHuffRLE(h, runLength, ac, ref emitBufferBase); - runLength = 0; - } - } - - if (runLength > 0) - { - this.EmitHuff(h, 0x00, ref emitBufferBase); - } - - return dc; - } - /// /// Writes the Define Huffman Table marker and tables. /// @@ -638,34 +295,16 @@ private void WriteDefineHuffmanTables(int componentCount) this.WriteMarkerHeader(JpegConstants.Markers.DHT, markerlen); for (int i = 0; i < specs.Length; i++) { - ref HuffmanSpec spec = ref specs[i]; - int len = 0; - - fixed (byte* huffman = this.huffmanBuffer) - fixed (byte* count = spec.Count) - fixed (byte* values = spec.Values) - { - huffman[len++] = headers[i]; - - for (int c = 0; c < spec.Count.Length; c++) - { - huffman[len++] = count[c]; - } - - for (int v = 0; v < spec.Values.Length; v++) - { - huffman[len++] = values[v]; - } - } - - this.outputStream.Write(this.huffmanBuffer, 0, len); + this.outputStream.WriteByte(headers[i]); + this.outputStream.Write(specs[i].Count); + this.outputStream.Write(specs[i].Values); } } /// /// Writes the Define Quantization Marker and tables. /// - private void WriteDefineQuantizationTables() + private void WriteDefineQuantizationTables(ref Block8x8F luminanceQuantTable, ref Block8x8F chrominanceQuantTable) { // Marker + quantization table lengths int markerlen = 2 + (QuantizationTableCount * (1 + Block8x8F.Size)); @@ -677,8 +316,8 @@ private void WriteDefineQuantizationTables() byte[] dqt = new byte[dqtCount]; int offset = 0; - WriteDataToDqt(dqt, ref offset, QuantIndex.Luminance, ref this.luminanceQuantTable); - WriteDataToDqt(dqt, ref offset, QuantIndex.Chrominance, ref this.chrominanceQuantTable); + WriteDataToDqt(dqt, ref offset, QuantIndex.Luminance, ref luminanceQuantTable); + WriteDataToDqt(dqt, ref offset, QuantIndex.Chrominance, ref chrominanceQuantTable); this.outputStream.Write(dqt, 0, dqtCount); } @@ -982,7 +621,6 @@ private void WriteStartOfFrame(int width, int height, int componentCount) private void WriteStartOfScan(Image image, int componentCount, CancellationToken cancellationToken) where TPixel : unmanaged, IPixel { - // TODO: Need a JpegScanEncoder class or struct that encapsulates the scan-encoding implementation. (Similar to JpegScanDecoder.) Span componentId = stackalloc byte[] { 0x01, @@ -1024,111 +662,16 @@ private void WriteStartOfScan(Image image, int componentCount, C this.buffer[sosSize] = 0x3f; // Se - End of spectral selection. this.buffer[sosSize + 1] = 0x00; // Ah + Ah (Successive approximation bit position high + low) this.outputStream.Write(this.buffer, 0, sosSize + 2); - - ref byte emitBufferBase = ref MemoryMarshal.GetReference(this.emitBuffer); - if (this.colorType == JpegColorType.Luminance) - { - this.EncodeGrayscale(image, cancellationToken, ref emitBufferBase); - } - else - { - switch (this.subsample) - { - case JpegSubsample.Ratio444: - this.Encode444(image, cancellationToken, ref emitBufferBase); - break; - case JpegSubsample.Ratio420: - this.Encode420(image, cancellationToken, ref emitBufferBase); - break; - } - } - - // Pad the last byte with 1's. - this.Emit(0x7f, 7, ref emitBufferBase); } /// - /// Encodes the image with subsampling. The Cb and Cr components are each subsampled - /// at a factor of 2 both horizontally and vertically. + /// Writes the EndOfImage marker. /// - /// The pixel format. - /// The pixel accessor providing access to the image pixels. - /// The token to monitor for cancellation. - /// The reference to the emit buffer. - private void Encode420(Image pixels, CancellationToken cancellationToken, ref byte emitBufferBase) - where TPixel : unmanaged, IPixel + private void WriteEndOfImageMarker() { - // TODO: Need a JpegScanEncoder class or struct that encapsulates the scan-encoding implementation. (Similar to JpegScanDecoder.) - Block8x8F b = default; - Span cb = stackalloc Block8x8F[4]; - Span cr = stackalloc Block8x8F[4]; - - Block8x8F temp1 = default; - Block8x8F temp2 = default; - - Block8x8F onStackLuminanceQuantTable = this.luminanceQuantTable; - Block8x8F onStackChrominanceQuantTable = this.chrominanceQuantTable; - - var unzig = ZigZag.CreateUnzigTable(); - - var pixelConverter = YCbCrForwardConverter.Create(); - - // ReSharper disable once InconsistentNaming - int prevDCY = 0, prevDCCb = 0, prevDCCr = 0; - ImageFrame frame = pixels.Frames.RootFrame; - Buffer2D pixelBuffer = frame.PixelBuffer; - RowOctet currentRows = default; - - for (int y = 0; y < pixels.Height; y += 16) - { - cancellationToken.ThrowIfCancellationRequested(); - for (int x = 0; x < pixels.Width; x += 16) - { - for (int i = 0; i < 4; i++) - { - int xOff = (i & 1) * 8; - int yOff = (i & 2) * 4; - - currentRows.Update(pixelBuffer, y + yOff); - pixelConverter.Convert(frame, x + xOff, y + yOff, ref currentRows); - - cb[i] = pixelConverter.Cb; - cr[i] = pixelConverter.Cr; - - prevDCY = this.WriteBlock( - QuantIndex.Luminance, - prevDCY, - ref pixelConverter.Y, - ref temp1, - ref temp2, - ref onStackLuminanceQuantTable, - ref unzig, - ref emitBufferBase); - } - - Block8x8F.Scale16X16To8X8(ref b, cb); - prevDCCb = this.WriteBlock( - QuantIndex.Chrominance, - prevDCCb, - ref b, - ref temp1, - ref temp2, - ref onStackChrominanceQuantTable, - ref unzig, - ref emitBufferBase); - - Block8x8F.Scale16X16To8X8(ref b, cr); - prevDCCr = this.WriteBlock( - QuantIndex.Chrominance, - prevDCCr, - ref b, - ref temp1, - ref temp2, - ref onStackChrominanceQuantTable, - ref unzig, - ref emitBufferBase); - } - } + this.buffer[0] = JpegConstants.Markers.XFF; + this.buffer[1] = JpegConstants.Markers.EOI; + this.outputStream.Write(this.buffer, 0, 2); } /// @@ -1145,5 +688,34 @@ private void WriteMarkerHeader(byte marker, int length) this.buffer[3] = (byte)(length & 0xff); this.outputStream.Write(this.buffer, 0, 4); } + + /// + /// Initializes quantization table. + /// + /// The quantization index. + /// The scaling factor. + /// The quantization table. + private static void InitQuantizationTable(int i, int scale, ref Block8x8F quant) + { + DebugGuard.MustBeBetweenOrEqualTo(i, 0, 1, nameof(i)); + ReadOnlySpan unscaledQuant = (i == 0) ? UnscaledQuant_Luminance : UnscaledQuant_Chrominance; + + for (int j = 0; j < Block8x8F.Size; j++) + { + int x = unscaledQuant[j]; + x = ((x * scale) + 50) / 100; + if (x < 1) + { + x = 1; + } + + if (x > 255) + { + x = 255; + } + + quant[j] = x; + } + } } } diff --git a/src/ImageSharp/Formats/Jpeg/JpegThrowHelper.cs b/src/ImageSharp/Formats/Jpeg/JpegThrowHelper.cs index fa9eb83917..cc75870e19 100644 --- a/src/ImageSharp/Formats/Jpeg/JpegThrowHelper.cs +++ b/src/ImageSharp/Formats/Jpeg/JpegThrowHelper.cs @@ -46,5 +46,8 @@ public static void ThrowNotImplementedException(string errorMessage) [MethodImpl(InliningOptions.ColdPath)] public static void ThrowInvalidImageDimensions(int width, int height) => throw new InvalidImageContentException($"Invalid image dimensions: {width}x{height}."); + + [MethodImpl(InliningOptions.ColdPath)] + public static void ThrowDimensionsTooLarge(int width, int height) => throw new ImageFormatException($"Image is too large to encode at {width}x{height} for JPEG format."); } } diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Scale16X16To8X8.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Scale16X16To8X8.cs deleted file mode 100644 index ebd3e40130..0000000000 --- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Scale16X16To8X8.cs +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) Six Labors. -// Licensed under the Apache License, Version 2.0. - -using System; -using BenchmarkDotNet.Attributes; -using SixLabors.ImageSharp.Formats.Jpeg.Components; - -namespace SixLabors.ImageSharp.Benchmarks.Format.Jpeg.Components -{ - [Config(typeof(Config.HwIntrinsics_SSE_AVX))] - public class Block8x8F_Scale16X16To8X8 - { - private Block8x8F source; - private readonly Block8x8F[] target = new Block8x8F[4]; - - [GlobalSetup] - public void Setup() - { - var random = new Random(); - - float[] f = new float[8 * 8]; - for (int i = 0; i < f.Length; i++) - { - f[i] = (float)random.NextDouble(); - } - - for (int i = 0; i < 4; i++) - { - this.target[i] = Block8x8F.Load(f); - } - - this.source = Block8x8F.Load(f); - } - - [Benchmark] - public void Scale16X16To8X8() => Block8x8F.Scale16X16To8X8(ref this.source, this.target); - } -} diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs index 5a9ceea946..47c6f2c7d4 100644 --- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs +++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/EncodeJpeg.cs @@ -4,6 +4,7 @@ using System.Drawing.Imaging; using System.IO; using BenchmarkDotNet.Attributes; +using SixLabors.ImageSharp.Formats.Jpeg; using SixLabors.ImageSharp.PixelFormats; using SixLabors.ImageSharp.Tests; using SDImage = System.Drawing.Image; @@ -12,10 +13,22 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg { public class EncodeJpeg { - // System.Drawing needs this. - private Stream bmpStream; + [Params(75, 90, 100)] + public int Quality; + + private const string TestImage = TestImages.Jpeg.BenchmarkSuite.Jpeg420Exif_MidSizeYCbCr; + + // System.Drawing private SDImage bmpDrawing; + private Stream bmpStream; + private ImageCodecInfo jpegCodec; + private EncoderParameters encoderParameters; + + // ImageSharp private Image bmpCore; + private JpegEncoder encoder420; + private JpegEncoder encoder444; + private MemoryStream destinationStream; [GlobalSetup] @@ -23,12 +36,20 @@ public void ReadImages() { if (this.bmpStream == null) { - const string TestImage = TestImages.Jpeg.BenchmarkSuite.Jpeg420Exif_MidSizeYCbCr; this.bmpStream = File.OpenRead(Path.Combine(TestEnvironment.InputImagesDirectoryFullPath, TestImage)); + this.bmpCore = Image.Load(this.bmpStream); this.bmpCore.Metadata.ExifProfile = null; + this.encoder420 = new JpegEncoder { Quality = this.Quality, Subsample = JpegSubsample.Ratio420 }; + this.encoder444 = new JpegEncoder { Quality = this.Quality, Subsample = JpegSubsample.Ratio444 }; + this.bmpStream.Position = 0; this.bmpDrawing = SDImage.FromStream(this.bmpStream); + this.jpegCodec = GetEncoder(ImageFormat.Jpeg); + this.encoderParameters = new EncoderParameters(1); + // Quality cast to long is necessary + this.encoderParameters.Param[0] = new EncoderParameter(Encoder.Quality, (long)this.Quality); + this.destinationStream = new MemoryStream(); } } @@ -38,36 +59,72 @@ public void Cleanup() { this.bmpStream.Dispose(); this.bmpStream = null; + + this.destinationStream.Dispose(); + this.destinationStream = null; + this.bmpCore.Dispose(); this.bmpDrawing.Dispose(); + + this.encoderParameters.Dispose(); } - [Benchmark(Baseline = true, Description = "System.Drawing Jpeg")] + [Benchmark(Baseline = true, Description = "System.Drawing Jpeg 4:2:0")] public void JpegSystemDrawing() { - this.bmpDrawing.Save(this.destinationStream, ImageFormat.Jpeg); + this.bmpDrawing.Save(this.destinationStream, this.jpegCodec, this.encoderParameters); + this.destinationStream.Seek(0, SeekOrigin.Begin); + } + + [Benchmark(Description = "ImageSharp Jpeg 4:2:0")] + public void JpegCore420() + { + this.bmpCore.SaveAsJpeg(this.destinationStream, this.encoder420); this.destinationStream.Seek(0, SeekOrigin.Begin); } - [Benchmark(Description = "ImageSharp Jpeg")] - public void JpegCore() + [Benchmark(Description = "ImageSharp Jpeg 4:4:4")] + public void JpegCore444() { - this.bmpCore.SaveAsJpeg(this.destinationStream); + this.bmpCore.SaveAsJpeg(this.destinationStream, this.encoder444); this.destinationStream.Seek(0, SeekOrigin.Begin); } + + // https://docs.microsoft.com/en-us/dotnet/api/system.drawing.imaging.encoderparameter?redirectedfrom=MSDN&view=net-5.0 + private static ImageCodecInfo GetEncoder(ImageFormat format) + { + ImageCodecInfo[] codecs = ImageCodecInfo.GetImageDecoders(); + foreach (ImageCodecInfo codec in codecs) + { + if (codec.FormatID == format.Guid) + { + return codec; + } + } + return null; + } } } /* -BenchmarkDotNet=v0.12.1, OS=Windows 10.0.18363.959 (1909/November2018Update/19H2) -Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores -.NET Core SDK=3.1.302 - [Host] : .NET Core 3.1.6 (CoreCLR 4.700.20.26901, CoreFX 4.700.20.31603), X64 RyuJIT - DefaultJob : .NET Core 3.1.6 (CoreCLR 4.700.20.26901, CoreFX 4.700.20.31603), X64 RyuJIT +BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19042 +Intel Core i7-6700K CPU 4.00GHz (Skylake), 1 CPU, 8 logical and 4 physical cores +.NET Core SDK=6.0.100-preview.3.21202.5 + [Host] : .NET Core 3.1.13 (CoreCLR 4.700.21.11102, CoreFX 4.700.21.11602), X64 RyuJIT [AttachedDebugger] + DefaultJob : .NET Core 3.1.13 (CoreCLR 4.700.21.11102, CoreFX 4.700.21.11602), X64 RyuJIT -| Method | Mean | Error | StdDev | Ratio | RatioSD | -|---------------------- |---------:|----------:|----------:|------:|--------:| -| 'System.Drawing Jpeg' | 4.297 ms | 0.0244 ms | 0.0228 ms | 1.00 | 0.00 | -| 'ImageSharp Jpeg' | 5.286 ms | 0.1034 ms | 0.0967 ms | 1.23 | 0.02 | +| Method | Quality | Mean | Error | StdDev | Ratio | RatioSD | +|---------------------------- |-------- |---------:|---------:|---------:|------:|--------:| +| 'System.Drawing Jpeg 4:2:0' | 75 | 30.60 ms | 0.496 ms | 0.464 ms | 1.00 | 0.00 | +| 'ImageSharp Jpeg 4:2:0' | 75 | 29.86 ms | 0.350 ms | 0.311 ms | 0.98 | 0.02 | +| 'ImageSharp Jpeg 4:4:4' | 75 | 45.36 ms | 0.899 ms | 1.036 ms | 1.48 | 0.05 | +| | | | | | | | +| 'System.Drawing Jpeg 4:2:0' | 90 | 34.05 ms | 0.669 ms | 0.687 ms | 1.00 | 0.00 | +| 'ImageSharp Jpeg 4:2:0' | 90 | 37.26 ms | 0.706 ms | 0.660 ms | 1.10 | 0.03 | +| 'ImageSharp Jpeg 4:4:4' | 90 | 52.54 ms | 0.579 ms | 0.514 ms | 1.55 | 0.04 | +| | | | | | | | +| 'System.Drawing Jpeg 4:2:0' | 100 | 39.36 ms | 0.267 ms | 0.237 ms | 1.00 | 0.00 | +| 'ImageSharp Jpeg 4:2:0' | 100 | 42.44 ms | 0.410 ms | 0.383 ms | 1.08 | 0.01 | +| 'ImageSharp Jpeg 4:4:4' | 100 | 70.88 ms | 0.508 ms | 0.450 ms | 1.80 | 0.02 | */ diff --git a/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs b/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs index 1db4072932..9aafb6936b 100644 --- a/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs +++ b/tests/ImageSharp.Benchmarks/Format/Jpeg/Components/Encoder/YCbCrForwardConverterBenchmark.cs @@ -37,7 +37,7 @@ public void ConvertLut() Block8x8F cb = default; Block8x8F cr = default; - this.converter.Convert(this.data.AsSpan(), ref y, ref cb, ref cr); + this.converter.Convert444(this.data.AsSpan(), ref y, ref cb, ref cr); } [Benchmark] @@ -49,7 +49,7 @@ public void ConvertVectorized() if (RgbToYCbCrConverterVectorized.IsSupported) { - RgbToYCbCrConverterVectorized.Convert(this.data.AsSpan(), ref y, ref cb, ref cr); + RgbToYCbCrConverterVectorized.Convert444(this.data.AsSpan(), ref y, ref cb, ref cr); } } } diff --git a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs index 75ad5427c7..d49a6498cd 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs @@ -2,10 +2,12 @@ // Licensed under the Apache License, Version 2.0. using System; - +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics.X86; +#endif using SixLabors.ImageSharp.Formats.Jpeg.Components; using SixLabors.ImageSharp.Tests.Formats.Jpg.Utils; - +using SixLabors.ImageSharp.Tests.TestUtilities; using Xunit; using Xunit.Abstractions; @@ -22,94 +24,180 @@ public FastFloatingPoint(ITestOutputHelper output) { } - [Fact] - public void IDCT2D8x4_LeftPart() + // Reference tests + [Theory] + [InlineData(1)] + [InlineData(2)] + [InlineData(3)] + public void LLM_TransformIDCT_CompareToNonOptimized(int seed) { - float[] sourceArray = Create8x8FloatData(); - var expectedDestArray = new float[64]; + float[] sourceArray = Create8x8RoundedRandomFloatData(-1000, 1000, seed); + + var source = Block8x8F.Load(sourceArray); - ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(sourceArray, expectedDestArray); + Block8x8F expected = ReferenceImplementations.LLM_FloatingPoint_DCT.TransformIDCT(ref source); + + var temp = default(Block8x8F); + var actual = default(Block8x8F); + FastFloatingPointDCT.TransformIDCT(ref source, ref actual, ref temp); - var source = default(Block8x8F); - source.LoadFrom(sourceArray); + this.CompareBlocks(expected, actual, 1f); + } - var dest = default(Block8x8F); + [Theory] + [InlineData(1)] + [InlineData(2)] + [InlineData(3)] + public void LLM_TransformIDCT_CompareToAccurate(int seed) + { + float[] sourceArray = Create8x8RoundedRandomFloatData(-1000, 1000, seed); - FastFloatingPointDCT.IDCT8x4_LeftPart(ref source, ref dest); + var source = Block8x8F.Load(sourceArray); - var actualDestArray = new float[64]; - dest.ScaledCopyTo(actualDestArray); + Block8x8F expected = ReferenceImplementations.AccurateDCT.TransformIDCT(ref source); - this.Print8x8Data(expectedDestArray); - this.Output.WriteLine("**************"); - this.Print8x8Data(actualDestArray); + var temp = default(Block8x8F); + var actual = default(Block8x8F); + FastFloatingPointDCT.TransformIDCT(ref source, ref actual, ref temp); - Assert.Equal(expectedDestArray, actualDestArray); + this.CompareBlocks(expected, actual, 1f); } - [Fact] - public void IDCT2D8x4_RightPart() + // Inverse transform + [Theory] + [InlineData(1)] + [InlineData(2)] + public void IDCT8x4_LeftPart(int seed) { - float[] sourceArray = Create8x8FloatData(); - var expectedDestArray = new float[64]; - - ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(sourceArray.AsSpan(4), expectedDestArray.AsSpan(4)); + Span src = Create8x8RoundedRandomFloatData(-200, 200, seed); + var srcBlock = default(Block8x8F); + srcBlock.LoadFrom(src); - var source = default(Block8x8F); - source.LoadFrom(sourceArray); + var destBlock = default(Block8x8F); - var dest = default(Block8x8F); + var expectedDest = new float[64]; - FastFloatingPointDCT.IDCT8x4_RightPart(ref source, ref dest); + // reference + ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(src, expectedDest); - var actualDestArray = new float[64]; - dest.ScaledCopyTo(actualDestArray); + // testee + FastFloatingPointDCT.IDCT8x4_LeftPart(ref srcBlock, ref destBlock); - this.Print8x8Data(expectedDestArray); - this.Output.WriteLine("**************"); - this.Print8x8Data(actualDestArray); + var actualDest = new float[64]; + destBlock.ScaledCopyTo(actualDest); - Assert.Equal(expectedDestArray, actualDestArray); + Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f)); } [Theory] [InlineData(1)] [InlineData(2)] - [InlineData(3)] - public void LLM_TransformIDCT_CompareToNonOptimized(int seed) + public void IDCT8x4_RightPart(int seed) { - float[] sourceArray = Create8x8RoundedRandomFloatData(-1000, 1000, seed); + Span src = Create8x8RoundedRandomFloatData(-200, 200, seed); + var srcBlock = default(Block8x8F); + srcBlock.LoadFrom(src); - var source = Block8x8F.Load(sourceArray); + var destBlock = default(Block8x8F); - Block8x8F expected = ReferenceImplementations.LLM_FloatingPoint_DCT.TransformIDCT(ref source); + var expectedDest = new float[64]; - var temp = default(Block8x8F); - var actual = default(Block8x8F); - FastFloatingPointDCT.TransformIDCT(ref source, ref actual, ref temp); + // reference + ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan(4)); - this.CompareBlocks(expected, actual, 1f); + // testee + FastFloatingPointDCT.IDCT8x4_RightPart(ref srcBlock, ref destBlock); + + var actualDest = new float[64]; + destBlock.ScaledCopyTo(actualDest); + + Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f)); } [Theory] [InlineData(1)] [InlineData(2)] - [InlineData(3)] - public void LLM_TransformIDCT_CompareToAccurate(int seed) + public void IDCT8x8_Avx(int seed) { - float[] sourceArray = Create8x8RoundedRandomFloatData(-1000, 1000, seed); +#if SUPPORTS_RUNTIME_INTRINSICS + var skip = !Avx.IsSupported; +#else + var skip = true; +#endif + + if (skip) + { + this.Output.WriteLine("No AVX present, skipping test!"); + return; + } - var source = Block8x8F.Load(sourceArray); + Span src = Create8x8RoundedRandomFloatData(-200, 200, seed); + var srcBlock = default(Block8x8F); + srcBlock.LoadFrom(src); - Block8x8F expected = ReferenceImplementations.AccurateDCT.TransformIDCT(ref source); + var destBlock = default(Block8x8F); - var temp = default(Block8x8F); - var actual = default(Block8x8F); - FastFloatingPointDCT.TransformIDCT(ref source, ref actual, ref temp); + var expectedDest = new float[64]; - this.CompareBlocks(expected, actual, 1f); + // reference, left part + ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(src, expectedDest); + + // reference, right part + ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan(4)); + + // testee, whole 8x8 + FastFloatingPointDCT.IDCT8x8_Avx(ref srcBlock, ref destBlock); + + var actualDest = new float[64]; + destBlock.ScaledCopyTo(actualDest); + + Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f)); } + [Theory] + [InlineData(1)] + [InlineData(2)] + public void TransformIDCT(int seed) + { + static void RunTest(string serialized) + { + int seed = FeatureTestRunner.Deserialize(serialized); + + Span src = Create8x8RoundedRandomFloatData(-200, 200, seed); + var srcBlock = default(Block8x8F); + srcBlock.LoadFrom(src); + + var destBlock = default(Block8x8F); + + var expectedDest = new float[64]; + var temp1 = new float[64]; + var temp2 = default(Block8x8F); + + // reference + ReferenceImplementations.LLM_FloatingPoint_DCT.IDCT2D_llm(src, expectedDest, temp1); + + // testee + FastFloatingPointDCT.TransformIDCT(ref srcBlock, ref destBlock, ref temp2); + + var actualDest = new float[64]; + destBlock.ScaledCopyTo(actualDest); + + Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f)); + } + + // 3 paths: + // 1. AllowAll - call avx/fma implementation + // 2. DisableFMA - call avx implementation without fma acceleration + // 3. DisableAvx - call fallback code of Vector4 implementation + // + // DisableSSE isn't needed because fallback Vector4 code will compile to either sse or fallback code with same result + FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + seed, + HwIntrinsics.AllowAll | HwIntrinsics.DisableFMA | HwIntrinsics.DisableAVX); + } + + // Forward transform [Theory] [InlineData(1)] [InlineData(2)] @@ -123,7 +211,10 @@ public void FDCT8x4_LeftPart(int seed) var expectedDest = new float[64]; + // reference ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src, expectedDest); + + // testee FastFloatingPointDCT.FDCT8x4_LeftPart(ref srcBlock, ref destBlock); var actualDest = new float[64]; @@ -145,7 +236,10 @@ public void FDCT8x4_RightPart(int seed) var expectedDest = new float[64]; + // reference ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan(4)); + + // testee FastFloatingPointDCT.FDCT8x4_RightPart(ref srcBlock, ref destBlock); var actualDest = new float[64]; @@ -157,8 +251,19 @@ public void FDCT8x4_RightPart(int seed) [Theory] [InlineData(1)] [InlineData(2)] - public void TransformFDCT(int seed) + public void FDCT8x8_Avx(int seed) { +#if SUPPORTS_RUNTIME_INTRINSICS + var skip = !Avx.IsSupported; +#else + var skip = true; +#endif + if (skip) + { + this.Output.WriteLine("No AVX present, skipping test!"); + return; + } + Span src = Create8x8RoundedRandomFloatData(-200, 200, seed); var srcBlock = default(Block8x8F); srcBlock.LoadFrom(src); @@ -166,17 +271,64 @@ public void TransformFDCT(int seed) var destBlock = default(Block8x8F); var expectedDest = new float[64]; - var temp1 = new float[64]; - var temp2 = default(Block8x8F); - ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D_llm(src, expectedDest, temp1, downscaleBy8: true); - FastFloatingPointDCT.TransformFDCT(ref srcBlock, ref destBlock, ref temp2, false); + // reference, left part + ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src, expectedDest); + + // reference, right part + ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D8x4_32f(src.Slice(4), expectedDest.AsSpan(4)); + + // testee, whole 8x8 + FastFloatingPointDCT.FDCT8x8_Avx(ref srcBlock, ref destBlock); var actualDest = new float[64]; destBlock.ScaledCopyTo(actualDest); Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f)); } + + [Theory] + [InlineData(1)] + [InlineData(2)] + public void TransformFDCT(int seed) + { + static void RunTest(string serialized) + { + int seed = FeatureTestRunner.Deserialize(serialized); + + Span src = Create8x8RoundedRandomFloatData(-200, 200, seed); + var srcBlock = default(Block8x8F); + srcBlock.LoadFrom(src); + + var destBlock = default(Block8x8F); + + var expectedDest = new float[64]; + var temp1 = new float[64]; + var temp2 = default(Block8x8F); + + // reference + ReferenceImplementations.LLM_FloatingPoint_DCT.FDCT2D_llm(src, expectedDest, temp1, downscaleBy8: true); + + // testee + FastFloatingPointDCT.TransformFDCT(ref srcBlock, ref destBlock, ref temp2, false); + + var actualDest = new float[64]; + destBlock.ScaledCopyTo(actualDest); + + Assert.Equal(actualDest, expectedDest, new ApproximateFloatComparer(1f)); + } + + // 3 paths: + // 1. AllowAll - call avx/fma implementation + // 2. DisableFMA - call avx implementation without fma acceleration + // 3. DisableAvx - call fallback code of Vector4 implementation + // + // DisableSSE isn't needed because fallback Vector4 code will compile to either sse or fallback code with same result + FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + seed, + HwIntrinsics.AllowAll | HwIntrinsics.DisableFMA | HwIntrinsics.DisableAVX); + } } } } diff --git a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs index 9a6fc8d6fd..0d5b550384 100644 --- a/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs +++ b/tests/ImageSharp.Tests/Formats/Jpg/RgbToYCbCrConverterTests.cs @@ -1,7 +1,13 @@ -// Copyright (c) Six Labors. +// Copyright (c) Six Labors. // Licensed under the Apache License, Version 2.0. using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif using SixLabors.ImageSharp.ColorSpaces; using SixLabors.ImageSharp.Formats.Jpeg.Components; using SixLabors.ImageSharp.Formats.Jpeg.Components.Encoder; @@ -23,22 +29,23 @@ public RgbToYCbCrConverterTests(ITestOutputHelper output) private ITestOutputHelper Output { get; } [Fact] - public void TestLutConverter() + public void TestConverterLut444() { - Rgb24[] data = CreateTestData(); + int dataSize = 8 * 8; + Rgb24[] data = CreateTestData(dataSize); var target = RgbToYCbCrConverterLut.Create(); Block8x8F y = default; Block8x8F cb = default; Block8x8F cr = default; - target.Convert(data.AsSpan(), ref y, ref cb, ref cr); + target.Convert444(data.AsSpan(), ref y, ref cb, ref cr); - Verify(data, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(1F)); + Verify444(data, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(1F)); } [Fact] - public void TestVectorizedConverter() + public void TestConverterVectorized444() { if (!RgbToYCbCrConverterVectorized.IsSupported) { @@ -46,18 +53,187 @@ public void TestVectorizedConverter() return; } - Rgb24[] data = CreateTestData(); + int dataSize = 8 * 8; + Rgb24[] data = CreateTestData(dataSize); Block8x8F y = default; Block8x8F cb = default; Block8x8F cr = default; - RgbToYCbCrConverterVectorized.Convert(data.AsSpan(), ref y, ref cb, ref cr); + RgbToYCbCrConverterVectorized.Convert444(data.AsSpan(), ref y, ref cb, ref cr); - Verify(data, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(0.0001F)); + Verify444(data, ref y, ref cb, ref cr, new ApproximateColorSpaceComparer(0.0001F)); } - private static void Verify(ReadOnlySpan data, ref Block8x8F yResult, ref Block8x8F cbResult, ref Block8x8F crResult, ApproximateColorSpaceComparer comparer) + [Fact] + public void TestConverterLut420() + { + int dataSize = 16 * 16; + Span data = CreateTestData(dataSize).AsSpan(); + var target = RgbToYCbCrConverterLut.Create(); + + var yBlocks = new Block8x8F[4]; + var cb = default(Block8x8F); + var cr = default(Block8x8F); + + target.Convert420(data, ref yBlocks[0], ref yBlocks[1], ref cb, ref cr, 0); + target.Convert420(data.Slice(16 * 8), ref yBlocks[2], ref yBlocks[3], ref cb, ref cr, 1); + + Verify420(data, yBlocks, ref cb, ref cr, new ApproximateFloatComparer(1F)); + } + + [Fact] + public void TestConverterVectorized420() + { + if (!RgbToYCbCrConverterVectorized.IsSupported) + { + this.Output.WriteLine("No AVX and/or FMA present, skipping test!"); + return; + } + + int dataSize = 16 * 16; + Span data = CreateTestData(dataSize).AsSpan(); + + var yBlocks = new Block8x8F[4]; + var cb = default(Block8x8F); + var cr = default(Block8x8F); + + RgbToYCbCrConverterVectorized.Convert420(data, ref yBlocks[0], ref yBlocks[1], ref cb, ref cr, 0); + RgbToYCbCrConverterVectorized.Convert420(data.Slice(16 * 8), ref yBlocks[2], ref yBlocks[3], ref cb, ref cr, 1); + + Verify420(data, yBlocks, ref cb, ref cr, new ApproximateFloatComparer(1F)); + } + +#if SUPPORTS_RUNTIME_INTRINSICS + [Theory] + [InlineData(1)] + [InlineData(2)] + [InlineData(3)] + public void Scale16x2_8x1(int seed) + { + if (!Avx2.IsSupported) + { + return; + } + + Span data = new Random(seed).GenerateRandomFloatArray(Vector256.Count * 4, -1000, 1000); + + // Act: + Vector256 resultVector = RgbToYCbCrConverterVectorized.Scale16x2_8x1(MemoryMarshal.Cast>(data)); + ref float result = ref Unsafe.As, float>(ref resultVector); + + // Assert: + // Comparison epsilon is tricky but 10^(-4) is good enough (?) + var comparer = new ApproximateFloatComparer(0.0001f); + for (int i = 0; i < Vector256.Count; i++) + { + float actual = Unsafe.Add(ref result, i); + float expected = CalculateAverage16x2_8x1(data, i); + + Assert.True(comparer.Equals(actual, expected), $"Pos {i}, Expected: {expected}, Actual: {actual}"); + } + + static float CalculateAverage16x2_8x1(Span data, int index) + { + int upIdx = index * 2; + int lowIdx = (index + 8) * 2; + return 0.25f * (data[upIdx] + data[upIdx + 1] + data[lowIdx] + data[lowIdx + 1]); + } + } +#endif + + private static void Verify444( + ReadOnlySpan data, + ref Block8x8F yResult, + ref Block8x8F cbResult, + ref Block8x8F crResult, + ApproximateColorSpaceComparer comparer) + { + Block8x8F y = default; + Block8x8F cb = default; + Block8x8F cr = default; + + RgbToYCbCr(data, ref y, ref cb, ref cr); + + for (int i = 0; i < Block8x8F.Size; i++) + { + Assert.True(comparer.Equals(new YCbCr(y[i], cb[i], cr[i]), new YCbCr(yResult[i], cbResult[i], crResult[i])), $"Pos {i}, Expected {y[i]} == {yResult[i]}, {cb[i]} == {cbResult[i]}, {cr[i]} == {crResult[i]}"); + } + } + + private static void Verify420( + ReadOnlySpan data, + Block8x8F[] yResult, + ref Block8x8F cbResult, + ref Block8x8F crResult, + ApproximateFloatComparer comparer) + { + var trueBlock = default(Block8x8F); + var cbTrue = new Block8x8F[4]; + var crTrue = new Block8x8F[4]; + + Span tempData = new Rgb24[8 * 8].AsSpan(); + + // top left + Copy8x8(data, tempData); + RgbToYCbCr(tempData, ref trueBlock, ref cbTrue[0], ref crTrue[0]); + VerifyBlock(ref yResult[0], ref trueBlock, comparer); + + // top right + Copy8x8(data.Slice(8), tempData); + RgbToYCbCr(tempData, ref trueBlock, ref cbTrue[1], ref crTrue[1]); + VerifyBlock(ref yResult[1], ref trueBlock, comparer); + + // bottom left + Copy8x8(data.Slice(8 * 16), tempData); + RgbToYCbCr(tempData, ref trueBlock, ref cbTrue[2], ref crTrue[2]); + VerifyBlock(ref yResult[2], ref trueBlock, comparer); + + // bottom right + Copy8x8(data.Slice((8 * 16) + 8), tempData); + RgbToYCbCr(tempData, ref trueBlock, ref cbTrue[3], ref crTrue[3]); + VerifyBlock(ref yResult[3], ref trueBlock, comparer); + + // verify Cb + Scale16X16To8X8(ref trueBlock, cbTrue); + VerifyBlock(ref cbResult, ref trueBlock, comparer); + + // verify Cr + Scale16X16To8X8(ref trueBlock, crTrue); + VerifyBlock(ref crResult, ref trueBlock, comparer); + + + // extracts 8x8 blocks from 16x8 memory region + static void Copy8x8(ReadOnlySpan source, Span dest) + { + for (int i = 0; i < 8; i++) + { + source.Slice(i * 16, 8).CopyTo(dest.Slice(i * 8)); + } + } + + // scales 16x16 to 8x8, used in chroma subsampling tests + static void Scale16X16To8X8(ref Block8x8F dest, ReadOnlySpan source) + { + for (int i = 0; i < 4; i++) + { + int dstOff = ((i & 2) << 4) | ((i & 1) << 2); + Block8x8F iSource = source[i]; + + for (int y = 0; y < 4; y++) + { + for (int x = 0; x < 4; x++) + { + int j = (16 * y) + (2 * x); + float sum = iSource[j] + iSource[j + 1] + iSource[j + 8] + iSource[j + 9]; + dest[(8 * y) + x + dstOff] = (sum + 2) * .25F; + } + } + } + } + } + + private static void RgbToYCbCr(ReadOnlySpan data, ref Block8x8F y, ref Block8x8F cb, ref Block8x8F cr) { for (int i = 0; i < data.Length; i++) { @@ -65,17 +241,23 @@ private static void Verify(ReadOnlySpan data, ref Block8x8F yResult, ref int g = data[i].G; int b = data[i].B; - float y = (0.299F * r) + (0.587F * g) + (0.114F * b); - float cb = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)); - float cr = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)); + y[i] = (0.299F * r) + (0.587F * g) + (0.114F * b); + cb[i] = 128F + ((-0.168736F * r) - (0.331264F * g) + (0.5F * b)); + cr[i] = 128F + ((0.5F * r) - (0.418688F * g) - (0.081312F * b)); + } + } - Assert.True(comparer.Equals(new YCbCr(y, cb, cr), new YCbCr(yResult[i], cbResult[i], crResult[i])), $"Pos {i}, Expected {y} == {yResult[i]}, {cb} == {cbResult[i]}, {cr} == {crResult[i]}"); + private static void VerifyBlock(ref Block8x8F res, ref Block8x8F target, ApproximateFloatComparer comparer) + { + for (int i = 0; i < Block8x8F.Size; i++) + { + Assert.True(comparer.Equals(res[i], target[i]), $"Pos {i}, Expected: {target[i]}, Got: {res[i]}"); } } - private static Rgb24[] CreateTestData() + private static Rgb24[] CreateTestData(int size) { - var data = new Rgb24[64]; + var data = new Rgb24[size]; var r = new Random(); var random = new byte[3];