splitting onedim_f_series in two functions

DiamonDinoia · Sep 11, 2024 · 1355818 · 1355818
1 parent 52cd6cc
commit 1355818
Show file tree

Hide file tree

Showing 3 changed files with 52 additions and 36 deletions.
diff --git a/include/cufinufft/common.h b/include/cufinufft/common.h
@@ -32,12 +32,13 @@ int setup_spreader_for_nufft(finufft_spread_opts &spopts, T eps, cufinufft_opts
 
 void set_nf_type12(CUFINUFFT_BIGINT ms, cufinufft_opts opts, finufft_spread_opts spopts,
                    CUFINUFFT_BIGINT *nf, CUFINUFFT_BIGINT b);
-// template<typename T>
-// void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, T *fwkerhalf, finufft_spread_opts
-// opts);
-template<typename T, bool>
-void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, T *a,
-                                   finufft_spread_opts opts);
+
+template<typename T>
+void onedim_uniformn_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, T *a,
+                                            finufft_spread_opts opts);
+template<typename T>
+void onedim_non_uniform_fseries_kernel_precomp(T *f, T *a, finufft_spread_opts opts);
+
 template<typename T>
 void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex<double> *a,
                                    T *fwkerhalf, finufft_spread_opts opts);

diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h
@@ -297,14 +297,14 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
     T fseries_precomp_f[3 * MAX_NQUAD];
     thrust::device_vector<T> d_fseries_precomp_a(3 * MAX_NQUAD);
     thrust::device_vector<T> d_fseries_precomp_f(3 * MAX_NQUAD);
-    onedim_fseries_kernel_precomp<T, true>(d_plan->nf1, fseries_precomp_f,
-                                           fseries_precomp_a, d_plan->spopts);
+    onedim_uniformn_fseries_kernel_precomp<T>(d_plan->nf1, fseries_precomp_f,
+                                              fseries_precomp_a, d_plan->spopts);
     if (d_plan->dim > 1)
-      onedim_fseries_kernel_precomp<T, true>(d_plan->nf2, fseries_precomp_f + MAX_NQUAD,
-                                             fseries_precomp_a + MAX_NQUAD,
-                                             d_plan->spopts);
+      onedim_uniformn_fseries_kernel_precomp<T>(
+          d_plan->nf2, fseries_precomp_f + MAX_NQUAD, fseries_precomp_a + MAX_NQUAD,
+          d_plan->spopts);
     if (d_plan->dim > 2)
-      onedim_fseries_kernel_precomp<T, true>(
+      onedim_uniformn_fseries_kernel_precomp<T>(
           d_plan->nf3, fseries_precomp_f + 2 * MAX_NQUAD,
           fseries_precomp_a + 2 * MAX_NQUAD, d_plan->spopts);
     // copy the precomputed data to the device using thrust
@@ -687,17 +687,18 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_
     if (d_plan->dim > 2) {
       phi_hat3.resize(N);
     }
-    onedim_fseries_kernel_precomp<T, false>(0, fseries_precomp_f.data(),
-                                            fseries_precomp_a.data(), d_plan->spopts);
+    onedim_non_uniform_fseries_kernel_precomp<T>(
+        fseries_precomp_f.data(), fseries_precomp_a.data(), d_plan->spopts);
     if (d_plan->dim > 1) {
-      onedim_fseries_kernel_precomp<T, false>(0, fseries_precomp_f.data() + MAX_NQUAD,
-                                              fseries_precomp_a.data() + MAX_NQUAD,
-                                              d_plan->spopts);
+      onedim_non_uniform_fseries_kernel_precomp<T>(fseries_precomp_f.data() + MAX_NQUAD,
+                                                   fseries_precomp_a.data() + MAX_NQUAD,
+                                                   d_plan->spopts);
     }
     if (d_plan->dim > 2) {
-      onedim_fseries_kernel_precomp<T, false>(0, fseries_precomp_f.data() + 2 * MAX_NQUAD,
-                                              fseries_precomp_a.data() + 2 * MAX_NQUAD,
-                                              d_plan->spopts);
+      onedim_non_uniform_fseries_kernel_precomp<T>(
+          fseries_precomp_f.data() + 2 * MAX_NQUAD,
+          fseries_precomp_a.data() + 2 * MAX_NQUAD,
+          d_plan->spopts);
     }
     // copy the precomputed data to the device using thrust
     thrust::copy(fseries_precomp_a.begin(), fseries_precomp_a.end(),

diff --git a/src/cuda/common.cu b/src/cuda/common.cu
@@ -196,25 +196,39 @@ void set_nf_type12(CUFINUFFT_BIGINT ms, cufinufft_opts opts, finufft_spread_opts
   f - funciton values at quadrature nodes multiplied with quadrature weights (a, f are
       provided as the inputs of onedim_fseries_kernel_compute() defined below)
 */
-template<typename T, bool phase_winding = true>
-void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, T *a,
-                                   finufft_spread_opts opts) {
+
+template<typename T>
+void onedim_uniformn_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, T *a,
+                                            finufft_spread_opts opts) {
+  T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support
+  // # quadr nodes in z (from 0 to J/2; reflections will be added)...
+  int q = (int)(2 + 3.0 * J2); // not sure why so large? cannot
+  // exceed MAX_NQUAD
+  double z[2 * MAX_NQUAD];
+  double w[2 * MAX_NQUAD];
+  finufft::quadrature::legendre_compute_glr(2 * q, z, w); // only half the nodes used,
+  // eg on (0,1)
+  for (int n = 0; n < q; ++n) {                        // set up nodes z_n and vals f_n
+    z[n] *= J2;                                        // rescale nodes
+    f[n] = J2 * w[n] * evaluate_kernel((T)z[n], opts); // vals & quadr wei
+    a[n] = ((T)(2.0 * M_PI) * (T)(nf / 2 - z[n]) / (T)nf); // phase winding rates
+  }
+}
+
+template<typename T>
+void onedim_non_uniform_fseries_kernel_precomp(T *f, T *a, finufft_spread_opts opts) {
   T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support
   // # quadr nodes in z (from 0 to J/2; reflections will be added)...
-  int q = (int)(2 + (phase_winding ? 3.0 : 2.0) * J2); // not sure why so large? cannot
-                                                       // exceed MAX_NQUAD
+  int q = (int)(2 + 2.0 * J2); // not sure why so large? cannot
+                               // exceed MAX_NQUAD
   double z[2 * MAX_NQUAD];
   double w[2 * MAX_NQUAD];
   finufft::quadrature::legendre_compute_glr(2 * q, z, w); // only half the nodes used,
                                                           // eg on (0,1)
   for (int n = 0; n < q; ++n) {                           // set up nodes z_n and vals f_n
     z[n] *= J2;                                           // rescale nodes
     f[n] = J2 * w[n] * evaluate_kernel((T)z[n], opts);    // vals & quadr wei
-    if constexpr (phase_winding) {
-      a[n] = ((T)(2.0 * M_PI) * (T)(nf / 2 - z[n]) / (T)nf); // phase winding rates
-    } else {
-      a[n] = T(z[n]);
-    }
+    a[n] = T(z[n]);
   }
 }
 
@@ -335,14 +349,14 @@ template int setup_spreader_for_nufft(finufft_spread_opts &spopts, float eps,
                                       cufinufft_opts opts);
 template int setup_spreader_for_nufft(finufft_spread_opts &spopts, double eps,
                                       cufinufft_opts opts);
-template void onedim_fseries_kernel_precomp<float, true>(
-    CUFINUFFT_BIGINT nf, float *f, float *a, finufft_spread_opts opts);
-template void onedim_fseries_kernel_precomp<double, true>(
-    CUFINUFFT_BIGINT nf, double *f, double *a, finufft_spread_opts opts);
-template void onedim_fseries_kernel_precomp<float, false>(
+template void onedim_uniformn_fseries_kernel_precomp<float>(
     CUFINUFFT_BIGINT nf, float *f, float *a, finufft_spread_opts opts);
-template void onedim_fseries_kernel_precomp<double, false>(
+template void onedim_uniformn_fseries_kernel_precomp<double>(
     CUFINUFFT_BIGINT nf, double *f, double *a, finufft_spread_opts opts);
+template void onedim_non_uniform_fseries_kernel_precomp<float>(float *f, float *a,
+                                                               finufft_spread_opts opts);
+template void onedim_non_uniform_fseries_kernel_precomp<double>(double *f, double *a,
+                                                                finufft_spread_opts opts);
 template int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, float *d_f,
                                     float *d_a, float *d_fwkerhalf1, float *d_fwkerhalf2,
                                     float *d_fwkerhalf3, int ns, cudaStream_t stream);