Skip to content

Commit

Permalink
resampler.c: update convolution functions and recommended practices
Browse files Browse the repository at this point in the history
- added comment that canonical version is not recommended (slow and inaccurate)
- eliminate previous #2 and #3 because they were faster, but also less accurate
- added suggested gcc/mingw compiler options for version #4 (now #2)
- new 2x unrolled version that MSVC likes (thanks bennetng @ HA)
  • Loading branch information
dbry committed Dec 28, 2024
1 parent c927d0f commit 4c35493
Showing 1 changed file with 17 additions and 29 deletions.
46 changes: 17 additions & 29 deletions resampler.c
Original file line number Diff line number Diff line change
Expand Up @@ -365,11 +365,11 @@ void resampleFree (Resample *cxt)

// This is the basic convolution operation that is the core of the resampler and utilizes the
// bulk of the CPU load (assuming reasonably long filters). The first version is the canonical
// form, followed by three variations that may or may not be faster depending on your compiler,
// options, and system. Try 'em and use the fastest, or rewrite them using SIMD. Note that on
// gcc and clang, -Ofast can make a huge difference.
// form for reference, followed by two variations that are more accurate and incorporate various
// degrees of parallelization that can be utilized by optimizing compilers. Try 'em and use the
// fastest, or rewrite them using SIMD.

#if 1 // Version 1 (canonical)
#if 0 // Version 1 (canonical, very simple but slow and less accurate, not recommended)
static double apply_filter (float *A, float *B, int num_taps)
{
float sum = 0.0;
Expand All @@ -381,46 +381,34 @@ static double apply_filter (float *A, float *B, int num_taps)
}
#endif

#if 0 // Version 2 (2x unrolled loop)
#if 1 // Version 2 (outside-in order, more accurate)
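// Works well with gcc and mingw; suggested compiler options:
//   -O3 -mavx2 -fno-signed-zeros -fno-trapping-math -fassociative-math
// (gcc only honors -fassociative-math when -fno-signed-zeros and
// -fno-trapping-math are also in effect, so keep all three together)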
// Dot product of the first num_taps elements of A and B, processed two
// taps per loop iteration (2x unrolled).
//
//   A, B      - arrays holding at least num_taps floats each
//   num_taps  - number of element pairs to multiply and accumulate
//
// The running sum is kept in single precision and widened to double on
// return. For even num_taps >= 2 the arithmetic is performed in the same
// order as before. A trailing odd tap is now included, and num_taps <= 0
// returns 0.0 without reading either array.
static double apply_filter (float *A, float *B, int num_taps)
{
    // Division (not a shift) so a negative count simply yields no pairs.
    int num_loops = num_taps / 2;
    float sum = 0.0;

    // Pre-tested loop: a do/while here would execute once with a zero
    // count, decrement it to -1, and then run far past the end of A and B.
    while (num_loops-- > 0) {
        sum += (A[0] * B[0]) + (A[1] * B[1]);
        A += 2; B += 2;
    }

    // Pick up the final tap when the count is odd instead of dropping it.
    if (num_taps > 0 && (num_taps & 1))
        sum += A[0] * B[0];

    return sum;
}
#endif

#if 0 // Version 3 (4x unrolled loop)
static double apply_filter (float *A, float *B, int num_taps)
{
int num_loops = num_taps >> 2;
int i = num_taps - 1;
float sum = 0.0;

do {
sum += (A[0] * B[0]) + (A[1] * B[1]) + (A[2] * B[2]) + (A[3] * B[3]);
A += 4; B += 4;
} while (--num_loops);
sum += (A[0] * B[0]) + (A[i] * B[i]);
A++; B++;
} while ((i -= 2) > 0);

return sum;
}
#endif

#if 0 // Version 4 (outside-in order, may be more accurate)
static double apply_filter (float *A, float *B, int num_taps)
#if 0 // Version 3 (outside-in order, 2x unrolled loop)
// Works well with MSVC, but gcc has trouble vectorizing it
static double apply_filter(float* A, float* B, int num_taps)
{
int i = num_taps - 1;
float sum = 0.0;

do {
sum += (A[0] * B[0]) + (A[i] * B[i]);
A++; B++;
} while ((i -= 2) > 0);
sum += (A[0] * B[0]) + (A[i] * B[i]) + (A[1] * B[1]) + (A[i - 1] * B[i - 1]);
A += 2; B += 2;
} while ((i -= 4) > 0);

return sum;
}
Expand Down

0 comments on commit 4c35493

Please sign in to comment.