@@ -92,6 +92,11 @@ public static void AdjustToFDCT(ref Block8x8F quantTable)
9292 tableRef = 0.125f / ( tableRef * Unsafe . Add ( ref multipliersRef , i ) ) ;
9393 tableRef = ref Unsafe . Add ( ref tableRef , 1 ) ;
9494 }
95+
96+ // Spectral macroblocks are not transposed before quantization.
97+ // The transpose is performed after quantization, at the zig-zag stage,
98+ // so the quantization table itself must be transposed here.
99+ quantTable . TransposeInplace ( ) ;
95100 }
96101
97102 /// <summary>
@@ -133,14 +138,9 @@ public static void TransformFDCT(ref Block8x8F block)
133138 }
134139 else
135140#endif
136- if ( Vector . IsHardwareAccelerated )
137141 {
138142 FDCT_Vector4 ( ref block ) ;
139143 }
140- else
141- {
142- FDCT_Scalar ( ref block ) ;
143- }
144144 }
145145
146146 /// <summary>
@@ -217,136 +217,17 @@ static void IDCT8x4_Vector4(ref Vector4 vecRef)
217217 }
218218 }
219219
220- /// <summary>
221- /// Apply 2D floating point FDCT inplace using scalar operations.
222- /// </summary>
223- /// <remarks>
224- /// Ported from libjpeg-turbo https://github.com/libjpeg-turbo/libjpeg-turbo/blob/main/jfdctflt.c.
225- /// </remarks>
226- /// <param name="block">Input block.</param>
227- private static void FDCT_Scalar ( ref Block8x8F block )
228- {
229- const int dctSize = 8 ;
230-
231- float tmp0 , tmp1 , tmp2 , tmp3 , tmp4 , tmp5 , tmp6 , tmp7 ;
232- float tmp10 , tmp11 , tmp12 , tmp13 ;
233- float z1 , z2 , z3 , z4 , z5 , z11 , z13 ;
234-
235- // First pass - process rows
236- ref float blockRef = ref Unsafe . As < Block8x8F , float > ( ref block ) ;
237- for ( int ctr = 7 ; ctr >= 0 ; ctr -- )
238- {
239- tmp0 = Unsafe . Add ( ref blockRef , 0 ) + Unsafe . Add ( ref blockRef , 7 ) ;
240- tmp7 = Unsafe . Add ( ref blockRef , 0 ) - Unsafe . Add ( ref blockRef , 7 ) ;
241- tmp1 = Unsafe . Add ( ref blockRef , 1 ) + Unsafe . Add ( ref blockRef , 6 ) ;
242- tmp6 = Unsafe . Add ( ref blockRef , 1 ) - Unsafe . Add ( ref blockRef , 6 ) ;
243- tmp2 = Unsafe . Add ( ref blockRef , 2 ) + Unsafe . Add ( ref blockRef , 5 ) ;
244- tmp5 = Unsafe . Add ( ref blockRef , 2 ) - Unsafe . Add ( ref blockRef , 5 ) ;
245- tmp3 = Unsafe . Add ( ref blockRef , 3 ) + Unsafe . Add ( ref blockRef , 4 ) ;
246- tmp4 = Unsafe . Add ( ref blockRef , 3 ) - Unsafe . Add ( ref blockRef , 4 ) ;
247-
248- // Even part
249- tmp10 = tmp0 + tmp3 ;
250- tmp13 = tmp0 - tmp3 ;
251- tmp11 = tmp1 + tmp2 ;
252- tmp12 = tmp1 - tmp2 ;
253-
254- Unsafe . Add ( ref blockRef , 0 ) = tmp10 + tmp11 ;
255- Unsafe . Add ( ref blockRef , 4 ) = tmp10 - tmp11 ;
256-
257- z1 = ( tmp12 + tmp13 ) * 0.707106781f ;
258- Unsafe . Add ( ref blockRef , 2 ) = tmp13 + z1 ;
259- Unsafe . Add ( ref blockRef , 6 ) = tmp13 - z1 ;
260-
261- // Odd part
262- tmp10 = tmp4 + tmp5 ;
263- tmp11 = tmp5 + tmp6 ;
264- tmp12 = tmp6 + tmp7 ;
265-
266- z5 = ( tmp10 - tmp12 ) * 0.382683433f ;
267- z2 = ( 0.541196100f * tmp10 ) + z5 ;
268- z4 = ( 1.306562965f * tmp12 ) + z5 ;
269- z3 = tmp11 * 0.707106781f ;
270-
271- z11 = tmp7 + z3 ;
272- z13 = tmp7 - z3 ;
273-
274- Unsafe . Add ( ref blockRef , 5 ) = z13 + z2 ;
275- Unsafe . Add ( ref blockRef , 3 ) = z13 - z2 ;
276- Unsafe . Add ( ref blockRef , 1 ) = z11 + z4 ;
277- Unsafe . Add ( ref blockRef , 7 ) = z11 - z4 ;
278-
279- blockRef = ref Unsafe . Add ( ref blockRef , dctSize ) ;
280- }
281-
282- // Second pass - process columns
283- blockRef = ref Unsafe . As < Block8x8F , float > ( ref block ) ;
284- for ( int ctr = 7 ; ctr >= 0 ; ctr -- )
285- {
286- tmp0 = Unsafe . Add ( ref blockRef , dctSize * 0 ) + Unsafe . Add ( ref blockRef , dctSize * 7 ) ;
287- tmp7 = Unsafe . Add ( ref blockRef , dctSize * 0 ) - Unsafe . Add ( ref blockRef , dctSize * 7 ) ;
288- tmp1 = Unsafe . Add ( ref blockRef , dctSize * 1 ) + Unsafe . Add ( ref blockRef , dctSize * 6 ) ;
289- tmp6 = Unsafe . Add ( ref blockRef , dctSize * 1 ) - Unsafe . Add ( ref blockRef , dctSize * 6 ) ;
290- tmp2 = Unsafe . Add ( ref blockRef , dctSize * 2 ) + Unsafe . Add ( ref blockRef , dctSize * 5 ) ;
291- tmp5 = Unsafe . Add ( ref blockRef , dctSize * 2 ) - Unsafe . Add ( ref blockRef , dctSize * 5 ) ;
292- tmp3 = Unsafe . Add ( ref blockRef , dctSize * 3 ) + Unsafe . Add ( ref blockRef , dctSize * 4 ) ;
293- tmp4 = Unsafe . Add ( ref blockRef , dctSize * 3 ) - Unsafe . Add ( ref blockRef , dctSize * 4 ) ;
294-
295- // Even part
296- tmp10 = tmp0 + tmp3 ;
297- tmp13 = tmp0 - tmp3 ;
298- tmp11 = tmp1 + tmp2 ;
299- tmp12 = tmp1 - tmp2 ;
300-
301- Unsafe . Add ( ref blockRef , dctSize * 0 ) = tmp10 + tmp11 ;
302- Unsafe . Add ( ref blockRef , dctSize * 4 ) = tmp10 - tmp11 ;
303-
304- z1 = ( tmp12 + tmp13 ) * 0.707106781f ;
305- Unsafe . Add ( ref blockRef , dctSize * 2 ) = tmp13 + z1 ;
306- Unsafe . Add ( ref blockRef , dctSize * 6 ) = tmp13 - z1 ;
307-
308- // Odd part
309- tmp10 = tmp4 + tmp5 ;
310- tmp11 = tmp5 + tmp6 ;
311- tmp12 = tmp6 + tmp7 ;
312-
313- z5 = ( tmp10 - tmp12 ) * 0.382683433f ;
314- z2 = ( 0.541196100f * tmp10 ) + z5 ;
315- z4 = ( 1.306562965f * tmp12 ) + z5 ;
316- z3 = tmp11 * 0.707106781f ;
317-
318- z11 = tmp7 + z3 ;
319- z13 = tmp7 - z3 ;
320-
321- Unsafe . Add ( ref blockRef , dctSize * 5 ) = z13 + z2 ;
322- Unsafe . Add ( ref blockRef , dctSize * 3 ) = z13 - z2 ;
323- Unsafe . Add ( ref blockRef , dctSize * 1 ) = z11 + z4 ;
324- Unsafe . Add ( ref blockRef , dctSize * 7 ) = z11 - z4 ;
325-
326- blockRef = ref Unsafe . Add ( ref blockRef , 1 ) ;
327- }
328- }
329-
330220 /// <summary>
331221 /// Apply floating point FDCT inplace using <see cref="Vector4"/> API.
332222 /// </summary>
333- /// <remarks>
334- /// This implementation must be called only if hardware supports 4
335- /// floating point numbers vector. Otherwise explicit scalar
336- /// implementation <see cref="FDCT_Scalar"/> is faster
337- /// because it does not rely on block transposition.
338- /// </remarks>
339223 /// <param name="block">Input block.</param>
340224 public static void FDCT_Vector4 ( ref Block8x8F block )
341225 {
342- DebugGuard . IsTrue ( Vector . IsHardwareAccelerated , "Scalar implementation should be called for non-accelerated hardware." ) ;
343-
344- // First pass - process rows
345- block . TransposeInplace ( ) ;
226+ // First pass - process columns
346227 FDCT8x4_Vector4 ( ref block . V0L ) ;
347228 FDCT8x4_Vector4 ( ref block . V0R ) ;
348229
349- // Second pass - process columns
230+ // Second pass - process rows
350231 block . TransposeInplace ( ) ;
351232 FDCT8x4_Vector4 ( ref block . V0L ) ;
352233 FDCT8x4_Vector4 ( ref block . V0R ) ;
0 commit comments