Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimized large SIMD butterflies for better register usage #134

Merged
merged 14 commits into from
Feb 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions benches/bench_rustfft_sse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ fn bench_planned_multi_f64(b: &mut Bencher, len: usize) {
/// Benchmark entry: calls `bench_planned_multi_f32(b, 17)`.
#[bench]
fn sse_butterfly32_17(b: &mut Bencher) {
    bench_planned_multi_f32(b, 17);
}

/// Benchmark entry: calls `bench_planned_multi_f32(b, 19)`.
#[bench]
fn sse_butterfly32_19(b: &mut Bencher) {
    bench_planned_multi_f32(b, 19);
}

/// Benchmark entry: calls `bench_planned_multi_f32(b, 23)`.
#[bench]
fn sse_butterfly32_23(b: &mut Bencher) {
    bench_planned_multi_f32(b, 23);
}

/// Benchmark entry: calls `bench_planned_multi_f32(b, 24)`.
#[bench]
fn sse_butterfly32_24(b: &mut Bencher) {
    bench_planned_multi_f32(b, 24);
}

/// Benchmark entry: calls `bench_planned_multi_f32(b, 29)`.
#[bench]
fn sse_butterfly32_29(b: &mut Bencher) {
    bench_planned_multi_f32(b, 29);
}

/// Benchmark entry: calls `bench_planned_multi_f32(b, 31)`.
#[bench]
fn sse_butterfly32_31(b: &mut Bencher) {
    bench_planned_multi_f32(b, 31);
}

/// Benchmark entry: calls `bench_planned_multi_f32(b, 32)`.
#[bench]
fn sse_butterfly32_32(b: &mut Bencher) {
    bench_planned_multi_f32(b, 32);
}
Expand All @@ -97,6 +98,7 @@ fn bench_planned_multi_f64(b: &mut Bencher, len: usize) {
/// Benchmark entry: calls `bench_planned_multi_f64` with `len = 17`.
#[bench]
fn sse_butterfly64_17(b: &mut Bencher) {
    bench_planned_multi_f64(b, 17);
}

/// Benchmark entry: calls `bench_planned_multi_f64` with `len = 19`.
#[bench]
fn sse_butterfly64_19(b: &mut Bencher) {
    bench_planned_multi_f64(b, 19);
}

/// Benchmark entry: calls `bench_planned_multi_f64` with `len = 23`.
#[bench]
fn sse_butterfly64_23(b: &mut Bencher) {
    bench_planned_multi_f64(b, 23);
}

/// Benchmark entry: calls `bench_planned_multi_f64` with `len = 24`.
#[bench]
fn sse_butterfly64_24(b: &mut Bencher) {
    bench_planned_multi_f64(b, 24);
}

/// Benchmark entry: calls `bench_planned_multi_f64` with `len = 29`.
#[bench]
fn sse_butterfly64_29(b: &mut Bencher) {
    bench_planned_multi_f64(b, 29);
}

/// Benchmark entry: calls `bench_planned_multi_f64` with `len = 31`.
#[bench]
fn sse_butterfly64_31(b: &mut Bencher) {
    bench_planned_multi_f64(b, 31);
}

/// Benchmark entry: calls `bench_planned_multi_f64` with `len = 32`.
#[bench]
fn sse_butterfly64_32(b: &mut Bencher) {
    bench_planned_multi_f64(b, 32);
}
Expand Down
8 changes: 8 additions & 0 deletions benches/bench_rustfft_wasm_simd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,10 @@ fn wasm_simd_butterfly32_23(b: &mut Bencher) {
bench_planned_multi_f32(b, 23);
}
/// Benchmark entry: forwards the bencher to `bench_planned_multi_f32`
/// with a size argument of 24.
#[bench]
fn wasm_simd_butterfly32_24(b: &mut Bencher) {
    let len = 24;
    bench_planned_multi_f32(b, len);
}
/// Benchmark entry: forwards the bencher to `bench_planned_multi_f32`
/// with a size argument of 29.
#[bench]
fn wasm_simd_butterfly32_29(b: &mut Bencher) {
    let len = 29;
    bench_planned_multi_f32(b, len);
}
Expand Down Expand Up @@ -238,6 +242,10 @@ fn wasm_simd_butterfly64_23(b: &mut Bencher) {
bench_planned_multi_f64(b, 23);
}
/// Benchmark entry: forwards the bencher to `bench_planned_multi_f64`
/// with a size argument of 24.
#[bench]
fn wasm_simd_butterfly64_24(b: &mut Bencher) {
    let len = 24;
    bench_planned_multi_f64(b, len);
}
/// Benchmark entry: forwards the bencher to `bench_planned_multi_f64`
/// with a size argument of 29.
#[bench]
fn wasm_simd_butterfly64_29(b: &mut Bencher) {
    let len = 29;
    bench_planned_multi_f64(b, len);
}
Expand Down
1,882 changes: 784 additions & 1,098 deletions src/neon/neon_butterflies.rs

Large diffs are not rendered by default.

7 changes: 1 addition & 6 deletions src/neon/neon_planner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ use crate::math_utils::{PrimeFactor, PrimeFactors};

const MIN_RADIX4_BITS: u32 = 6; // smallest size to consider radix 4 an option is 2^6 = 64
const MAX_RADER_PRIME_FACTOR: usize = 23; // don't use Raders if the inner fft length has prime factor larger than this
const RADIX4_USE_BUTTERFLY32_FROM: u32 = 18; // Use length 32 butterfly starting from this length

/// A Recipe is a structure that describes the design of a FFT, without actually creating it.
/// It is used as a middle step in the planning process.
Expand Down Expand Up @@ -666,11 +665,7 @@ impl<T: FftNum> FftPlannerNeon<T> {
// main case: if len is a power of 4, use a base of 16, otherwise use a base of 32
_ => {
if p2 % 2 == 1 {
if p2 >= RADIX4_USE_BUTTERFLY32_FROM {
32
} else {
8
}
32
} else {
16
}
Expand Down
42 changes: 42 additions & 0 deletions src/neon/neon_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,27 @@ impl Rotate90F32 {
vreinterpretq_u32_f32(self.sign_both),
))
}

/// Computes `(rotate_both(values) + values) * sqrt(0.5)` on every lane.
///
/// Adding a vector to its quarter turn and rescaling by `1/sqrt(2)` yields a
/// 45-degree rotation in the same direction.
/// NOTE(review): this assumes `rotate_both` is the 90-degree rotation the
/// `Rotate90F32` name suggests — confirm against its definition.
#[inline(always)]
pub unsafe fn rotate_both_45(&self, values: float32x4_t) -> float32x4_t {
    let scale = vmovq_n_f32(0.5f32.sqrt());
    let turned = self.rotate_both(values);
    vmulq_f32(vaddq_f32(turned, values), scale)
}

/// Computes `(rotate_both(values) - values) * sqrt(0.5)` on every lane.
///
/// Subtracting a vector from its quarter turn and rescaling by `1/sqrt(2)`
/// yields a 135-degree rotation in the same direction.
/// NOTE(review): this assumes `rotate_both` is the 90-degree rotation the
/// `Rotate90F32` name suggests — confirm against its definition.
#[inline(always)]
pub unsafe fn rotate_both_135(&self, values: float32x4_t) -> float32x4_t {
    let scale = vmovq_n_f32(0.5f32.sqrt());
    let turned = self.rotate_both(values);
    vmulq_f32(vsubq_f32(turned, values), scale)
}

/// Computes `(rotate_both(values) + values) * -sqrt(0.5)` on every lane.
///
/// This is the negation of the 45-degree formula, i.e. a further half turn,
/// giving 225 degrees in total.
/// NOTE(review): this assumes `rotate_both` is the 90-degree rotation the
/// `Rotate90F32` name suggests — confirm against its definition.
#[inline(always)]
pub unsafe fn rotate_both_225(&self, values: float32x4_t) -> float32x4_t {
    // Negative scale: flips the sign of the 45-degree result.
    let neg_scale = vmovq_n_f32(-(0.5f32.sqrt()));
    let turned = self.rotate_both(values);
    let sum = vaddq_f32(turned, values);
    vmulq_f32(sum, neg_scale)
}
}

// Pack low (1st) complex
Expand Down Expand Up @@ -202,6 +223,27 @@ impl Rotate90F64 {
vreinterpretq_u64_f64(self.sign),
))
}

/// Computes `(rotate(values) + values) * sqrt(0.5)` on every lane.
///
/// Adding a vector to its quarter turn and rescaling by `1/sqrt(2)` yields a
/// 45-degree rotation in the same direction.
/// NOTE(review): this assumes `rotate` is the 90-degree rotation the
/// `Rotate90F64` name suggests — confirm against its definition.
#[inline(always)]
pub unsafe fn rotate_45(&self, values: float64x2_t) -> float64x2_t {
    let scale = vmovq_n_f64(0.5f64.sqrt());
    let turned = self.rotate(values);
    vmulq_f64(vaddq_f64(turned, values), scale)
}

/// Computes `(rotate(values) - values) * sqrt(0.5)` on every lane.
///
/// Subtracting a vector from its quarter turn and rescaling by `1/sqrt(2)`
/// yields a 135-degree rotation in the same direction.
/// NOTE(review): this assumes `rotate` is the 90-degree rotation the
/// `Rotate90F64` name suggests — confirm against its definition.
#[inline(always)]
pub unsafe fn rotate_135(&self, values: float64x2_t) -> float64x2_t {
    let scale = vmovq_n_f64(0.5f64.sqrt());
    let turned = self.rotate(values);
    vmulq_f64(vsubq_f64(turned, values), scale)
}

/// Computes `(rotate(values) + values) * -sqrt(0.5)` on every lane.
///
/// This is the negation of the 45-degree formula, i.e. a further half turn,
/// giving 225 degrees in total.
/// NOTE(review): this assumes `rotate` is the 90-degree rotation the
/// `Rotate90F64` name suggests — confirm against its definition.
#[inline(always)]
pub unsafe fn rotate_225(&self, values: float64x2_t) -> float64x2_t {
    // Negative scale: flips the sign of the 45-degree result.
    let neg_scale = vmovq_n_f64(-(0.5f64.sqrt()));
    let turned = self.rotate(values);
    let sum = vaddq_f64(turned, values);
    vmulq_f64(sum, neg_scale)
}
}

#[cfg(test)]
Expand Down
13 changes: 13 additions & 0 deletions src/neon/neon_vector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,9 @@ pub trait NeonVector: Copy + Debug + Send + Sync {
unsafe fn store_partial_lo_complex(ptr: *mut Complex<Self::ScalarType>, data: Self);
unsafe fn store_partial_hi_complex(ptr: *mut Complex<Self::ScalarType>, data: Self);

// math ops
/// Element-wise negation: returns `a` with the sign of every lane flipped.
///
/// NOTE(review): the safety contract is not spelled out here; presumably it
/// is the NEON-availability requirement of the underlying intrinsics — confirm.
unsafe fn neg(a: Self) -> Self;

/// Generates a chunk of twiddle factors starting at (X,Y) and incrementing X `COMPLEX_PER_VECTOR` times.
/// The result will be [twiddle(x*y, len), twiddle((x+1)*y, len), twiddle((x+2)*y, len), ...] for as many complex numbers fit in a vector
unsafe fn make_mixedradix_twiddle_chunk(
Expand Down Expand Up @@ -212,6 +215,11 @@ impl NeonVector for float32x4_t {
vst1_f32(ptr as *mut f32, high);
}

/// Negates all four `f32` lanes of `a` via the `vnegq_f32` intrinsic.
#[inline(always)]
unsafe fn neg(a: Self) -> Self {
vnegq_f32(a)
}

#[inline(always)]
unsafe fn make_mixedradix_twiddle_chunk(
x: usize,
Expand Down Expand Up @@ -315,6 +323,11 @@ impl NeonVector for float64x2_t {
unimplemented!("Impossible to do a partial store of complex f64's");
}

/// Negates both `f64` lanes of `a` via the `vnegq_f64` intrinsic.
#[inline(always)]
unsafe fn neg(a: Self) -> Self {
vnegq_f64(a)
}

#[inline(always)]
unsafe fn make_mixedradix_twiddle_chunk(
x: usize,
Expand Down
Loading
Loading