ejmahler · ejmahler · Feb 27, 2024 · Feb 19, 2024 · Feb 19, 2024 · Feb 19, 2024
diff --git a/benches/bench_rustfft_sse.rs b/benches/bench_rustfft_sse.rs
@@ -76,6 +76,7 @@ fn bench_planned_multi_f64(b: &mut Bencher, len: usize) {
 #[bench] fn sse_butterfly32_17(b: &mut Bencher) { bench_planned_multi_f32(b, 17);}
 #[bench] fn sse_butterfly32_19(b: &mut Bencher) { bench_planned_multi_f32(b, 19);}
 #[bench] fn sse_butterfly32_23(b: &mut Bencher) { bench_planned_multi_f32(b, 23);}
+#[bench] fn sse_butterfly32_24(b: &mut Bencher) { bench_planned_multi_f32(b, 24);} // 24: presumably the FFT length
 #[bench] fn sse_butterfly32_29(b: &mut Bencher) { bench_planned_multi_f32(b, 29);}
 #[bench] fn sse_butterfly32_31(b: &mut Bencher) { bench_planned_multi_f32(b, 31);}
 #[bench] fn sse_butterfly32_32(b: &mut Bencher) { bench_planned_multi_f32(b, 32);}
@@ -97,6 +98,7 @@ fn bench_planned_multi_f64(b: &mut Bencher, len: usize) {
 #[bench] fn sse_butterfly64_17(b: &mut Bencher) { bench_planned_multi_f64(b, 17);}
 #[bench] fn sse_butterfly64_19(b: &mut Bencher) { bench_planned_multi_f64(b, 19);}
 #[bench] fn sse_butterfly64_23(b: &mut Bencher) { bench_planned_multi_f64(b, 23);}
+#[bench] fn sse_butterfly64_24(b: &mut Bencher) { bench_planned_multi_f64(b, 24);} // 24 is passed as `len`
 #[bench] fn sse_butterfly64_29(b: &mut Bencher) { bench_planned_multi_f64(b, 29);}
 #[bench] fn sse_butterfly64_31(b: &mut Bencher) { bench_planned_multi_f64(b, 31);}
 #[bench] fn sse_butterfly64_32(b: &mut Bencher) { bench_planned_multi_f64(b, 32);}

diff --git a/benches/bench_rustfft_wasm_simd.rs b/benches/bench_rustfft_wasm_simd.rs
@@ -157,6 +157,10 @@ fn wasm_simd_butterfly32_23(b: &mut Bencher) {
     bench_planned_multi_f32(b, 23);
 }
 #[bench]
+fn wasm_simd_butterfly32_24(b: &mut Bencher) {
+    bench_planned_multi_f32(b, 24); // 24: presumably the FFT length, as in the neighbouring cases
+}
+#[bench]
 fn wasm_simd_butterfly32_29(b: &mut Bencher) {
     bench_planned_multi_f32(b, 29);
 }
@@ -238,6 +242,10 @@ fn wasm_simd_butterfly64_23(b: &mut Bencher) {
     bench_planned_multi_f64(b, 23);
 }
 #[bench]
+fn wasm_simd_butterfly64_24(b: &mut Bencher) {
+    bench_planned_multi_f64(b, 24); // 24: presumably the FFT length, as in the neighbouring cases
+}
+#[bench]
 fn wasm_simd_butterfly64_29(b: &mut Bencher) {
     bench_planned_multi_f64(b, 29);
 }

diff --git a/src/neon/neon_butterflies.rs b/src/neon/neon_butterflies.rs
diff --git a/src/neon/neon_planner.rs b/src/neon/neon_planner.rs
@@ -16,7 +16,6 @@ use crate::math_utils::{PrimeFactor, PrimeFactors};
 
 const MIN_RADIX4_BITS: u32 = 6; // smallest size to consider radix 4 an option is 2^6 = 64
 const MAX_RADER_PRIME_FACTOR: usize = 23; // don't use Raders if the inner fft length has prime factor larger than this
-const RADIX4_USE_BUTTERFLY32_FROM: u32 = 18; // Use length 32 butterfly starting from this length
 
 /// A Recipe is a structure that describes the design of a FFT, without actually creating it.
 /// It is used as a middle step in the planning process.
@@ -666,11 +665,7 @@ impl<T: FftNum> FftPlannerNeon<T> {
-                // main case: if len is a power of 4, use a base of 16, otherwise use a base of 8
+                // main case: if len is a power of 4, use a base of 16, otherwise use a base of 32
                 _ => {
                     if p2 % 2 == 1 {
-                        if p2 >= RADIX4_USE_BUTTERFLY32_FROM {
-                            32
-                        } else {
-                            8
-                        }
+                        32
                     } else {
                         16
                     }

diff --git a/src/neon/neon_utils.rs b/src/neon/neon_utils.rs
@@ -71,6 +71,48 @@ impl Rotate90F32 {
             vreinterpretq_u32_f32(self.sign_both),
         ))
     }
+
+    /// Rotates the complex numbers in `values` by 45 degrees, in the direction of `rotate_both`.
+    ///
+    /// Computed as `(rotate_both(values) + values) * sqrt(0.5)`. Assuming `rotate_both` is a
+    /// quarter turn, the sum is an eighth turn scaled by `sqrt(2)`; the multiply renormalizes it.
+    ///
+    /// # Safety
+    /// Calls NEON intrinsics. NOTE(review): presumably requires NEON support at runtime; confirm.
+    #[inline(always)]
+    pub unsafe fn rotate_both_45(&self, values: float32x4_t) -> float32x4_t {
+        let rotated = self.rotate_both(values);
+        let sum = vaddq_f32(rotated, values);
+        vmulq_f32(sum, vmovq_n_f32(std::f32::consts::FRAC_1_SQRT_2))
+    }
+
+    /// Rotates the complex numbers in `values` by 135 degrees, in the direction of `rotate_both`.
+    ///
+    /// Computed as `(rotate_both(values) - values) * sqrt(0.5)`. Assuming `rotate_both` is a
+    /// quarter turn, the difference is a 135-degree turn scaled by `sqrt(2)`.
+    ///
+    /// # Safety
+    /// Calls NEON intrinsics. NOTE(review): presumably requires NEON support at runtime; confirm.
+    #[inline(always)]
+    pub unsafe fn rotate_both_135(&self, values: float32x4_t) -> float32x4_t {
+        let rotated = self.rotate_both(values);
+        let diff = vsubq_f32(rotated, values);
+        vmulq_f32(diff, vmovq_n_f32(std::f32::consts::FRAC_1_SQRT_2))
+    }
+
+    /// Rotates the complex numbers in `values` by 225 degrees, in the direction of `rotate_both`.
+    ///
+    /// Computed as `(rotate_both(values) + values) * -sqrt(0.5)`, i.e. the 45-degree rotation
+    /// negated: negating a complex number turns it by a further 180 degrees.
+    ///
+    /// # Safety
+    /// Calls NEON intrinsics. NOTE(review): presumably requires NEON support at runtime; confirm.
+    #[inline(always)]
+    pub unsafe fn rotate_both_225(&self, values: float32x4_t) -> float32x4_t {
+        let rotated = self.rotate_both(values);
+        let sum = vaddq_f32(rotated, values);
+        vmulq_f32(sum, vmovq_n_f32(-std::f32::consts::FRAC_1_SQRT_2))
+    }
 }
 
 // Pack low (1st) complex
@@ -202,6 +223,48 @@ impl Rotate90F64 {
             vreinterpretq_u64_f64(self.sign),
         ))
     }
+
+    /// Rotates the complex number in `values` by 45 degrees, in the direction of `rotate`.
+    ///
+    /// Computed as `(rotate(values) + values) * sqrt(0.5)`. Assuming `rotate` is a quarter
+    /// turn, the sum is an eighth turn scaled by `sqrt(2)`; the multiply renormalizes it.
+    ///
+    /// # Safety
+    /// Calls NEON intrinsics. NOTE(review): presumably requires NEON support at runtime; confirm.
+    #[inline(always)]
+    pub unsafe fn rotate_45(&self, values: float64x2_t) -> float64x2_t {
+        let rotated = self.rotate(values);
+        let sum = vaddq_f64(rotated, values);
+        vmulq_f64(sum, vmovq_n_f64(std::f64::consts::FRAC_1_SQRT_2))
+    }
+
+    /// Rotates the complex number in `values` by 135 degrees, in the direction of `rotate`.
+    ///
+    /// Computed as `(rotate(values) - values) * sqrt(0.5)`. Assuming `rotate` is a quarter
+    /// turn, the difference is a 135-degree turn scaled by `sqrt(2)`.
+    ///
+    /// # Safety
+    /// Calls NEON intrinsics. NOTE(review): presumably requires NEON support at runtime; confirm.
+    #[inline(always)]
+    pub unsafe fn rotate_135(&self, values: float64x2_t) -> float64x2_t {
+        let rotated = self.rotate(values);
+        let diff = vsubq_f64(rotated, values);
+        vmulq_f64(diff, vmovq_n_f64(std::f64::consts::FRAC_1_SQRT_2))
+    }
+
+    /// Rotates the complex number in `values` by 225 degrees, in the direction of `rotate`.
+    ///
+    /// Computed as `(rotate(values) + values) * -sqrt(0.5)`, i.e. the 45-degree rotation
+    /// negated: negating a complex number turns it by a further 180 degrees.
+    ///
+    /// # Safety
+    /// Calls NEON intrinsics. NOTE(review): presumably requires NEON support at runtime; confirm.
+    #[inline(always)]
+    pub unsafe fn rotate_225(&self, values: float64x2_t) -> float64x2_t {
+        let rotated = self.rotate(values);
+        let sum = vaddq_f64(rotated, values);
+        vmulq_f64(sum, vmovq_n_f64(-std::f64::consts::FRAC_1_SQRT_2))
+    }
 }
 
 #[cfg(test)]

diff --git a/src/neon/neon_vector.rs b/src/neon/neon_vector.rs
@@ -147,6 +147,9 @@ pub trait NeonVector: Copy + Debug + Send + Sync {
     unsafe fn store_partial_lo_complex(ptr: *mut Complex<Self::ScalarType>, data: Self);
     unsafe fn store_partial_hi_complex(ptr: *mut Complex<Self::ScalarType>, data: Self);
 
+    // math ops
+    unsafe fn neg(a: Self) -> Self; // negation of `a` (the impls use the `vnegq_*` intrinsics)
+
     /// Generates a chunk of twiddle factors starting at (X,Y) and incrementing X `COMPLEX_PER_VECTOR` times.
     /// The result will be [twiddle(x*y, len), twiddle((x+1)*y, len), twiddle((x+2)*y, len), ...] for as many complex numbers fit in a vector
     unsafe fn make_mixedradix_twiddle_chunk(
@@ -212,6 +215,11 @@ impl NeonVector for float32x4_t {
         vst1_f32(ptr as *mut f32, high);
     }
 
+    #[inline(always)]
+    unsafe fn neg(a: Self) -> Self {
+        vnegq_f32(a) // NEON negate: flips the sign of every f32 lane of `a`
+    }
+
     #[inline(always)]
     unsafe fn make_mixedradix_twiddle_chunk(
         x: usize,
@@ -315,6 +323,11 @@ impl NeonVector for float64x2_t {
         unimplemented!("Impossible to do a partial store of complex f64's");
     }
 
+    #[inline(always)]
+    unsafe fn neg(a: Self) -> Self {
+        vnegq_f64(a) // NEON negate: flips the sign of every f64 lane of `a`
+    }
+
     #[inline(always)]
     unsafe fn make_mixedradix_twiddle_chunk(
         x: usize,