Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Used inline SIMD vectors if they are constants #2122

Merged
7 commits merged on Jun 14, 2022
5 changes: 2 additions & 3 deletions src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,6 @@ internal partial struct Block8x8F
[FieldOffset(224)]
public Vector256<float> V7;

- private static readonly Vector256<int> MultiplyIntoInt16ShuffleMask = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7);

private static unsafe void MultiplyIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
{
DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");
Expand All @@ -45,14 +43,15 @@ private static unsafe void MultiplyIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F
ref Vector256<float> bBase = ref b.V0;

ref Vector256<short> destRef = ref dest.V01;
+ Vector256<int> multiplyIntoInt16ShuffleMask = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7);

for (nint i = 0; i < 8; i += 2)
{
Vector256<int> row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
Vector256<int> row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));

Vector256<short> row = Avx2.PackSignedSaturate(row0, row1);
- row = Avx2.PermuteVar8x32(row.AsInt32(), MultiplyIntoInt16ShuffleMask).AsInt16();
+ row = Avx2.PermuteVar8x32(row.AsInt32(), multiplyIntoInt16ShuffleMask).AsInt16();

Unsafe.Add(ref destRef, (IntPtr)((uint)i / 2)) = row;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,6 @@ namespace SixLabors.ImageSharp.Formats.Jpeg.Components
{
internal static partial class FastFloatingPointDCT
{
- #pragma warning disable SA1310, SA1311, IDE1006 // naming rule violation warnings
- private static readonly Vector256<float> mm256_F_0_7071 = Vector256.Create(0.707106781f);
- private static readonly Vector256<float> mm256_F_0_3826 = Vector256.Create(0.382683433f);
- private static readonly Vector256<float> mm256_F_0_5411 = Vector256.Create(0.541196100f);
- private static readonly Vector256<float> mm256_F_1_3065 = Vector256.Create(1.306562965f);
-
- private static readonly Vector256<float> mm256_F_1_4142 = Vector256.Create(1.414213562f);
- private static readonly Vector256<float> mm256_F_1_8477 = Vector256.Create(1.847759065f);
- private static readonly Vector256<float> mm256_F_n1_0823 = Vector256.Create(-1.082392200f);
- private static readonly Vector256<float> mm256_F_n2_6131 = Vector256.Create(-2.613125930f);
- #pragma warning restore SA1310, SA1311, IDE1006

/// <summary>
/// Apply floating point FDCT inplace using simd operations.
/// </summary>
Expand Down Expand Up @@ -57,6 +45,7 @@ static void FDCT8x8_1D_Avx(ref Block8x8F block)
block.V0 = Avx.Add(tmp10, tmp11);
block.V4 = Avx.Subtract(tmp10, tmp11);

+ Vector256<float> mm256_F_0_7071 = Vector256.Create(0.707106781f);
Vector256<float> z1 = Avx.Multiply(Avx.Add(tmp12, tmp13), mm256_F_0_7071);
block.V2 = Avx.Add(tmp13, z1);
block.V6 = Avx.Subtract(tmp13, z1);
Expand All @@ -66,9 +55,9 @@ static void FDCT8x8_1D_Avx(ref Block8x8F block)
tmp11 = Avx.Add(tmp5, tmp6);
tmp12 = Avx.Add(tmp6, tmp7);

- Vector256<float> z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), mm256_F_0_3826);
- Vector256<float> z2 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_0_5411, tmp10);
- Vector256<float> z4 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, mm256_F_1_3065, tmp12);
+ Vector256<float> z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), Vector256.Create(0.382683433f)); // mm256_F_0_3826
+ Vector256<float> z2 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, Vector256.Create(0.541196100f), tmp10); // mm256_F_0_5411
+ Vector256<float> z4 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, Vector256.Create(1.306562965f), tmp12); // mm256_F_1_3065
Vector256<float> z3 = Avx.Multiply(tmp11, mm256_F_0_7071);

Vector256<float> z11 = Avx.Add(tmp7, z3);
Expand Down Expand Up @@ -109,6 +98,7 @@ static void IDCT8x8_1D_Avx(ref Block8x8F block)
Vector256<float> tmp10 = Avx.Add(z5, tmp2);
Vector256<float> tmp11 = Avx.Subtract(z5, tmp2);

+ Vector256<float> mm256_F_1_4142 = Vector256.Create(1.414213562f);
Vector256<float> tmp13 = Avx.Add(tmp1, tmp3);
Vector256<float> tmp12 = SimdUtils.HwIntrinsics.MultiplySubstract(tmp13, Avx.Subtract(tmp1, tmp3), mm256_F_1_4142);

Expand All @@ -131,10 +121,10 @@ static void IDCT8x8_1D_Avx(ref Block8x8F block)
tmp7 = Avx.Add(z11, z13);
tmp11 = Avx.Multiply(Avx.Subtract(z11, z13), mm256_F_1_4142);

- z5 = Avx.Multiply(Avx.Add(z10, z12), mm256_F_1_8477);
+ z5 = Avx.Multiply(Avx.Add(z10, z12), Vector256.Create(1.847759065f)); // mm256_F_1_8477

- tmp10 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z12, mm256_F_n1_0823);
- tmp12 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z10, mm256_F_n2_6131);
+ tmp10 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z12, Vector256.Create(-1.082392200f)); // mm256_F_n1_0823
+ tmp12 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z10, Vector256.Create(-2.613125930f)); // mm256_F_n2_6131

tmp6 = Avx.Subtract(tmp12, tmp7);
tmp5 = Avx.Subtract(tmp11, tmp6);
Expand Down
Loading