resampler.c: update convolution functions and recommended practices

- added comment that canonical version is not recommended (slow and inaccurate)
- eliminated previous #2 and #3 because, although faster, they were also less accurate
- added suggested gcc/mingw compiler options for version #4 (now #2)
- new 2x unrolled version that MSVC likes (thanks bennetng @ HA)
dbry · Dec 28, 2024 · 4c35493 · 4c35493
1 parent c927d0f
commit 4c35493
Showing 1 changed file with 17 additions and 29 deletions.
diff --git a/resampler.c b/resampler.c
@@ -365,11 +365,11 @@ void resampleFree (Resample *cxt)
 
 // This is the basic convolution operation that is the core of the resampler and utilizes the
 // bulk of the CPU load (assuming reasonably long filters). The first version is the canonical
-// form, followed by three variations that may or may not be faster depending on your compiler,
-// options, and system. Try 'em and use the fastest, or rewrite them using SIMD. Note that on
-// gcc and clang, -Ofast can make a huge difference.
+// form for reference, followed by two variations that are more accurate and incorporate various
+// degrees of parallelization that can be utilized by optimizing compilers. Try 'em and use the
+// fastest, or rewrite them using SIMD.
 
-#if 1   // Version 1 (canonical)
+#if 0   // Version 1 (canonical, very simple but slow and less accurate, not recommended)
 static double apply_filter (float *A, float *B, int num_taps)
 {
     float sum = 0.0;
@@ -381,46 +381,34 @@ static double apply_filter (float *A, float *B, int num_taps)
 }
 #endif
 
-#if 0   // Version 2 (2x unrolled loop)
+#if 1   // Version 2 (outside-in order, more accurate)
+        // Works well with gcc and mingw
+        // try "-O3 -mavx2 -fno-signed-zeros -fno-trapping-math -fassociative-math"
 static double apply_filter (float *A, float *B, int num_taps)
 {
-    int num_loops = num_taps >> 1;
-    float sum = 0.0;
-
-    do {
-        sum += (A[0] * B[0]) + (A[1] * B[1]);
-        A += 2; B += 2;
-    } while (--num_loops);
-
-    return sum;
-}
-#endif
-
-#if 0   // Version 3 (4x unrolled loop)
-static double apply_filter (float *A, float *B, int num_taps)
-{
-    int num_loops = num_taps >> 2;
+    int i = num_taps - 1;
     float sum = 0.0;
 
     do {
-        sum += (A[0] * B[0]) + (A[1] * B[1]) + (A[2] * B[2]) + (A[3] * B[3]);
-        A += 4; B += 4;
-    } while (--num_loops);
+        sum += (A[0] * B[0]) + (A[i] * B[i]);
+        A++; B++;
+    } while ((i -= 2) > 0);
 
     return sum;
 }
 #endif
 
-#if 0   // Version 4 (outside-in order, may be more accurate)
-static double apply_filter (float *A, float *B, int num_taps)
+#if 0   // Version 3 (outside-in order, 2x unrolled loop)
+        // Works well with MSVC, but gcc has trouble vectorizing it
+static double apply_filter(float* A, float* B, int num_taps)
 {
     int i = num_taps - 1;
     float sum = 0.0;
 
     do {
-        sum += (A[0] * B[0]) + (A[i] * B[i]);
-        A++; B++;
-    } while ((i -= 2) > 0);
+        sum += (A[0] * B[0]) + (A[i] * B[i]) + (A[1] * B[1]) + (A[i - 1] * B[i - 1]);
+        A += 2; B += 2;
+    } while ((i -= 4) > 0);
 
     return sum;
 }