Addressed review comments
DiamonDinoia committed Sep 4, 2024
1 parent 9b0da66 commit d3d4d34
Showing 19 changed files with 98 additions and 67 deletions.
12 changes: 6 additions & 6 deletions CHANGELOG
@@ -3,12 +3,12 @@ If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately).

V 2.4.0 (08/28/24)
* Support for type 3 in 1D, 2D, and 3D in the GPU library cufinufft (PR #517).
* Removed the CPU fseries computation (only used for benchmarks; no longer needed).
* Added complex arithmetic support for cuda_complex type
* Added tests for type 3 in 1D, 2D, and 3D and cuda_complex arithmetic
* Minor fixes on the GPU code:
- removed memory leaks in case of errors
- renamed maxbatchsize to batchsize
- Removed the CPU fseries computation (only used for benchmarks; no longer needed).
- Added complex arithmetic support for cuda_complex type
- Added tests for type 3 in 1D, 2D, and 3D and cuda_complex arithmetic
- Minor fixes on the GPU code:
a) removed memory leaks in case of errors
b) renamed maxbatchsize to batchsize

V 2.3.0-rc1 (8/6/24)

2 changes: 1 addition & 1 deletion docs/devnotes.rst
@@ -51,7 +51,7 @@ Developer notes

* CMake compiling on linux at Flatiron Institute (Rusty cluster): We have had a report that if you want to use LLVM, you need to ``module load llvm/16.0.3`` otherwise the default ``llvm/14.0.6`` does not find ``OpenMP_CXX``.

* Note to the nvcc developer: nvcc with debug symbols causes a stack overflow that is undetected at both compile time and runtime. This goes undetected until ns>=10; for ns<10, one can use -G and debug the code with cuda-gdb. The way to avoid it is to not use debug symbols; possibly ``--generate-line-info`` might work (not tested). As a side note, compute-sanitizers do not detect the issue.
* Note to the nvcc developer: nvcc with debug symbols causes a stack overflow that is undetected at both compile time and runtime. This goes undetected until ns>=10 and dim=3; for ns<10 or dim<3, one can use -G and debug the code with cuda-gdb. The way to avoid it is to not use debug symbols; possibly ``--generate-line-info`` might work (not tested). As a side note, compute-sanitizers do not detect the issue.

* Testing cufinufft (for FI, mostly):

2 changes: 1 addition & 1 deletion docs/opts.rst
@@ -183,7 +183,7 @@ for only two settings, as follows. Otherwise, setting it to zero chooses a good
Historical note: A former option ``3`` has been removed. This was like ``2`` except allowing nested OMP parallelism, so multi-threaded spread-interpolate was used for each of the vectors in a batch in parallel. This was used by Andrea Malleo in 2019. We have not yet found a case where this beats both ``1`` and ``2``, hence removed it due to complications with changing the OMP nesting state in both old and new OMP versions.


**batchsize**: in the case of multiple transforms per call (``ntr>1``, or the "many" interfaces), set the largest batch size of data vectors.
**maxbatchsize**: in the case of multiple transforms per call (``ntr>1``, or the "many" interfaces), set the largest batch size of data vectors.
Here ``0`` makes an automatic choice. If you are unhappy with this, then for small problems it should equal the number of threads, while for large problems ``1`` often appears to be better (since otherwise too much simultaneous RAM movement occurs). Some further work is needed to optimize this parameter.
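For illustration (not part of this diff), a minimal sketch of setting this option from C++; it assumes the standard guru-interface flow and the field name documented above:

    #include <finufft.h>

    finufft_opts opts;
    finufft_default_opts(&opts);  // start from the library defaults
    opts.maxbatchsize = 1;        // e.g. for large problems: limit simultaneous RAM movement
    // pass &opts to finufft_makeplan(...) as usual; leaving 0 keeps the automatic choice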

**spread_nthr_atomic**: if non-negative: for numbers of threads up to this value, an OMP critical block for ``add_wrapped_subgrid`` is used in spreading (type 1 transforms). Above this value, instead OMP atomic writes are used, which scale better for large thread numbers. If negative, the heuristic default in the spreader is used, set in ``src/spreadinterp.cpp:setup_spreader()``.
1 change: 0 additions & 1 deletion include/cufinufft/common.h
@@ -8,7 +8,6 @@
#include <finufft_spread_opts.h>

#include <complex>
#include <optional>

namespace cufinufft {
namespace common {
5 changes: 5 additions & 0 deletions include/cufinufft/contrib/helper_math.h
@@ -3,6 +3,11 @@

#include <cuComplex.h>

// This header provides some helper functions for cuComplex types.
// It mainly wraps existing CUDA implementations to provide operator overloads,
// e.g. cuCadd, cuCsub, cuCmul, cuCdiv, cuCreal, cuCimag, cuCabs, cuConj are all
// provided by CUDA

// Addition for cuDoubleComplex (double) with cuDoubleComplex (double)
__host__ __device__ __forceinline__ cuDoubleComplex operator+(
const cuDoubleComplex &a, const cuDoubleComplex &b) noexcept {
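As a usage illustration (a sketch, not part of the diff), these overloads let device code use infix arithmetic in place of the cuCadd-style calls:

    #include <cuComplex.h>
    // plus the operator overloads from helper_math.h above

    __global__ void add_demo(cuDoubleComplex *out) {
      const cuDoubleComplex a = make_cuDoubleComplex(1.0, 2.0);
      const cuDoubleComplex b = make_cuDoubleComplex(3.0, -1.0);
      *out = a + b; // wraps cuCadd(a, b); yields (4.0, 1.0)
    }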
1 change: 0 additions & 1 deletion include/cufinufft/defs.h
@@ -1,7 +1,6 @@
#ifndef CUFINUFFT_DEFS_H
#define CUFINUFFT_DEFS_H

#include <complex>
#include <limits>
// constants needed within common
// upper bound on w, ie nspread, even when padded (see evaluate_kernel_vector); also for
28 changes: 17 additions & 11 deletions include/cufinufft/impl.h
@@ -116,11 +116,11 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
d_plan->iflag = fftsign;
d_plan->ntransf = ntransf;

int maxbatchsize = (opts != nullptr) ? opts->gpu_maxbatchsize : 0;
int batchsize = (opts != nullptr) ? opts->gpu_maxbatchsize : 0;
// TODO: check if this is the right heuristic
if (maxbatchsize == 0) // implies: use a heuristic.
maxbatchsize = std::min(ntransf, 8); // heuristic from test codes
d_plan->batchsize = maxbatchsize;
if (batchsize == 0) // implies: use a heuristic.
batchsize = std::min(ntransf, 8); // heuristic from test codes
d_plan->batchsize = batchsize;

const auto stream = d_plan->stream = (cudaStream_t)d_plan->opts.gpu_stream;

@@ -262,23 +262,23 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
int inembed[] = {(int)nf1};

cufft_status = cufftPlanMany(&fftplan, 1, n, inembed, 1, inembed[0], inembed, 1,
inembed[0], cufft_type<T>(), maxbatchsize);
inembed[0], cufft_type<T>(), batchsize);
} break;
case 2: {
int n[] = {(int)nf2, (int)nf1};
int inembed[] = {(int)nf2, (int)nf1};

cufft_status =
cufftPlanMany(&fftplan, 2, n, inembed, 1, inembed[0] * inembed[1], inembed, 1,
inembed[0] * inembed[1], cufft_type<T>(), maxbatchsize);
inembed[0] * inembed[1], cufft_type<T>(), batchsize);
} break;
case 3: {
int n[] = {(int)nf3, (int)nf2, (int)nf1};
int inembed[] = {(int)nf3, (int)nf2, (int)nf1};

cufft_status = cufftPlanMany(
&fftplan, 3, n, inembed, 1, inembed[0] * inembed[1] * inembed[2], inembed, 1,
inembed[0] * inembed[1] * inembed[2], cufft_type<T>(), maxbatchsize);
inembed[0] * inembed[1] * inembed[2], cufft_type<T>(), batchsize);
} break;
}

@@ -292,6 +292,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran

d_plan->fftplan = fftplan;

// compute up to 3 * MAX_NQUAD precomputed values on CPU
T fseries_precomp_a[3 * MAX_NQUAD];
T fseries_precomp_f[3 * MAX_NQUAD];
thrust::device_vector<T> d_fseries_precomp_a(3 * MAX_NQUAD);
@@ -311,6 +312,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
d_fseries_precomp_a.begin());
thrust::copy(fseries_precomp_f, fseries_precomp_f + 3 * MAX_NQUAD,
d_fseries_precomp_f.begin());
// the full fseries computation is then done on the GPU here
if ((ier = cufserieskernelcompute(
d_plan->dim, d_plan->nf1, d_plan->nf2, d_plan->nf3,
d_fseries_precomp_f.data().get(), d_fseries_precomp_a.data().get(),
@@ -446,6 +448,12 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_
return cufinufft_setpts_12_impl<T>(M, d_kx, d_ky, d_kz, d_plan);
}
// type 3 setpts

// This code follows the same implementation as the CPU code in finufft and uses
// similar variable names where possible. However, the use of GPU routines and
// paradigms makes it harder to follow. To understand the code, it is recommended
// to read the CPU code first.

if (d_plan->type != 3) {
fprintf(stderr, "[%s] Invalid type (%d): should be 1, 2, or 3.\n", __func__,
d_plan->type);
@@ -744,8 +752,8 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_
thrust::cuda::par.on(stream), phase_iterator, phase_iterator + N,
d_plan->deconv, d_plan->deconv,
[c1, c2, c3, d1, d2, d3, realsign] __host__ __device__(
const thrust::tuple<T, T, T> tuple,
cuda_complex<T> deconv) -> cuda_complex<T> {
const thrust::tuple<T, T, T> tuple, cuda_complex<T> deconv)
-> cuda_complex<T> {
// d2 and d3 are 0 if dim < 2 and dim < 3
const auto phase = c1 * (thrust::get<0>(tuple) + d1) +
c2 * (thrust::get<1>(tuple) + d2) +
@@ -861,8 +869,6 @@ int cufinufft_destroy_impl(cufinufft_plan_t<T> *d_plan)
In this stage, we
(1) free all the memory that has been allocated on the gpu
(2) delete the cuFFT plan
Also see ../docs/cppdoc.md for main user-facing documentation.
*/
{

6 changes: 5 additions & 1 deletion include/cufinufft/spreadinterp.h
@@ -23,6 +23,10 @@ static __forceinline__ __device__ constexpr T cudaFMA(const T a, const T b, cons
return std::fma(a, b, c);
}

/**
* local NU coord fold+rescale: maps x to ((x + pi) mod 2pi) * N / (2pi),
* i.e. folds x into [0,N)
*/
template<typename T>
constexpr __forceinline__ __host__ __device__ T fold_rescale(T x, int N) {
constexpr auto x2pi = T(0.159154943091895345554011992339482617);
@@ -92,8 +96,8 @@ static __device__ void eval_kernel_vec_horner(T *ker, const T x, const int w,
This is the current evaluation method, since it's faster (except i7 w=16).
Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */
{
// const T z = T(2) * x + T(w - 1);
const auto z = cudaFMA(T(2), x, T(w - 1)); // scale so local grid offset z in [-1,1]
// const T z = T(2) * x + T(w - 1);
// insert the auto-generated code which expects z, w args, writes to ker...
if (upsampfac == 2.0) { // floating point equality is fine here
using FLT = T;
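For intuition, a host-side reference of the fold+rescale semantics documented above (a sketch; the device code uses the FMA-based equivalent shown in the hunk):

    #include <cmath>

    // maps any real x to ((x + pi) mod 2pi) * N / (2pi), i.e. into [0, N)
    double fold_rescale_ref(double x, int N) {
      const double t = x / (2.0 * M_PI) + 0.5; // -pi -> 0, +pi -> 1
      return (t - std::floor(t)) * N;          // fractional part, scaled to the grid
    }
    // e.g. fold_rescale_ref(-M_PI, 8) == 0.0 and fold_rescale_ref(0.0, 8) == 4.0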
7 changes: 6 additions & 1 deletion include/cufinufft/types.h
@@ -14,6 +14,10 @@

// Marco Barbone 8/5/2024, replaced the ugly trick with std::conditional
// to define cuda_complex
// by using std::conditional and std::is_same, we can define cuda_complex
// if T is float, cuda_complex<T> is cuFloatComplex
// if T is double, cuda_complex<T> is cuDoubleComplex
// where cuFloatComplex and cuDoubleComplex are defined in cuComplex.h
// TODO: migrate to cuda/std/complex and remove this
// Issue: cufft seems not to support cuda::std::complex
// A reinterpret_cast should be enough
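The alias being described presumably reduces to the following (a sketch of the std::conditional trick; the header itself is authoritative):

    #include <cuComplex.h>
    #include <type_traits>

    // float -> cuFloatComplex, double -> cuDoubleComplex
    template<typename T>
    using cuda_complex = typename std::conditional<std::is_same<T, float>::value,
                                                   cuFloatComplex, cuDoubleComplex>::type;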
@@ -61,7 +65,8 @@ template<typename T> struct cufinufft_plan_t {

// Type 3 specific
struct {
T X1, C1, S1, D1, h1, gam1; // x dim: X=halfwid C=center D=freqcen h,gam=rescale
T X1, C1, S1, D1, h1, gam1; // x dim: X=halfwid C=center D=freqcen h,gam=rescale,
// S=freq interval halfwid
T X2, C2, S2, D2, h2, gam2; // y
T X3, C3, S3, D3, h3, gam3; // z
} type3_params;
28 changes: 12 additions & 16 deletions include/cufinufft/utils.h
@@ -35,6 +35,10 @@ __inline__ __device__ double atomicAdd(double *address, double val) {
}
#endif

/**
* It computes the start and end points of the spreading window, given the center x
* and the width ns (e.g. ns=4, x=10.3 gives the inclusive range [9, 12]).
*/
template<typename T> __forceinline__ __device__ auto interval(const int ns, const T x) {
const auto xstart = int(std::ceil(x - T(ns) * T(.5)));
const auto xend = int(std::floor(x + T(ns) * T(.5)));
@@ -114,8 +118,8 @@ template<typename T> T infnorm(int n, std::complex<T> *a) {
*/

template<typename T>
static __forceinline__ __device__ void atomicAddComplexShared(
cuda_complex<T> *address, cuda_complex<T> res) {
static __forceinline__ __device__ void atomicAddComplexShared(cuda_complex<T> *address,
cuda_complex<T> res) {
const auto raw_address = reinterpret_cast<T *>(address);
atomicAdd(raw_address, res.x);
atomicAdd(raw_address + 1, res.y);
@@ -127,8 +131,8 @@ static __forceinline__ __device__ void atomicAddComplexShared(
* on shared memory are supported so we leverage them
*/
template<typename T>
static __forceinline__ __device__ void atomicAddComplexGlobal(
cuda_complex<T> *address, cuda_complex<T> res) {
static __forceinline__ __device__ void atomicAddComplexGlobal(cuda_complex<T> *address,
cuda_complex<T> res) {
if constexpr (
std::is_same_v<cuda_complex<T>, float2> && COMPUTE_CAPABILITY_90_OR_HIGHER) {
atomicAdd(address, res);
@@ -168,18 +172,10 @@ template<typename T> auto arraywidcen(int n, T *a, cudaStream_t stream) {
template<typename T>
auto set_nhg_type3(T S, T X, const cufinufft_opts &opts,
const finufft_spread_opts &spopts)
/* sets nf, h (upsampled grid spacing), and gamma (x_j rescaling factor),
for type 3 only.
Inputs:
X and S are the xj and sk interval half-widths respectively.
opts and spopts are the NUFFT and spreader opts structs, respectively.
Outputs:
nf is the size of upsampled grid for a given single dimension.
h is the grid spacing = 2pi/nf
gam is the x rescale factor, ie x'_j = x_j/gam (modulo shifts).
Barnett 2/13/17. Caught inf/nan 3/14/17. io int types changed 3/28/17
New logic 6/12/17
*/
/*
* It implements the same function as set_nhg_type3 in finufft.cpp;
* see finufft.cpp for its documentation.
*/
{
int nss = spopts.nspread + 1; // since ns may be odd
T Xsafe = X, Ssafe = S; // may be tweaked locally
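For reference, the CPU logic being mirrored computes roughly the following (assumed from set_nhg_type3 in finufft.cpp; consult that file for the authoritative version):

    // given safeguarded half-widths Xsafe (nonuniform points) and Ssafe (frequencies):
    //   nf  = smallest even integer >= 2*upsampfac*Ssafe*Xsafe/PI + nspread + 1
    //   h   = 2*PI / nf                     // upsampled grid spacing
    //   gam = nf / (2*upsampfac*Ssafe)      // rescale factor, i.e. x'_j = x_j / gam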
6 changes: 3 additions & 3 deletions src/cuda/1d/cufinufft1d.cu
@@ -121,13 +121,13 @@ template<typename T>
int cufinufft1d3_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
cufinufft_plan_t<T> *d_plan) {
/*
3D Type-3 NUFFT
1D Type-3 NUFFT
This function is called in "exec" stage (See ../cufinufft.cu).
It includes (copied from doc in finufft library)
Step 0: pre-phase the input strengths
Step 1: spread data
Step 2: Type 3 NUFFT
Step 2: Type 2 NUFFT
Step 3: deconvolve (amplify) each Fourier mode, using kernel Fourier coeff
Marco Barbone 08/14/2024
Expand Down Expand Up @@ -159,7 +159,7 @@ int cufinufft1d3_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
// Step 1: Spread
if ((ier = cuspread1d<T>(d_plan, blksize))) return ier;
// now d_plan->fk = d_plan->fw contains the spread values
// Step 2: Type 3 NUFFT
// Step 2: Type 2 NUFFT
// type 2 goes from fk to c
// saving the results directly in the user output array d_fk
// it needs to do blksize transforms
6 changes: 3 additions & 3 deletions src/cuda/2d/cufinufft2d.cu
@@ -123,13 +123,13 @@ template<typename T>
int cufinufft2d3_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
cufinufft_plan_t<T> *d_plan) {
/*
3D Type-3 NUFFT
2D Type-3 NUFFT
This function is called in "exec" stage (See ../cufinufft.cu).
It includes (copied from doc in finufft library)
Step 0: pre-phase the input strengths
Step 1: spread data
Step 2: Type 3 NUFFT
Step 2: Type 2 NUFFT
Step 3: deconvolve (amplify) each Fourier mode, using kernel Fourier coeff
Marco Barbone 08/14/2024
Expand Down Expand Up @@ -160,7 +160,7 @@ int cufinufft2d3_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
// Step 1: Spread
if ((ier = cuspread2d<T>(d_plan, blksize))) return ier;
// now d_plan->fk = d_plan->fw contains the spread values
// Step 2: Type 3 NUFFT
// Step 2: Type 2 NUFFT
// type 2 goes from fk to c
// saving the results directly in the user output array d_fk
// it needs to do blksize transforms
4 changes: 2 additions & 2 deletions src/cuda/3d/cufinufft3d.cu
@@ -125,7 +125,7 @@ int cufinufft3d3_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
It includes (copied from doc in finufft library)
Step 0: pre-phase the input strengths
Step 1: spread data
Step 2: Type 3 NUFFT
Step 2: Type 2 NUFFT
Step 3: deconvolve (amplify) each Fourier mode, using kernel Fourier coeff
Marco Barbone 08/14/2024
Expand Down Expand Up @@ -157,7 +157,7 @@ int cufinufft3d3_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
// Step 1: Spread
if ((ier = cuspread3d<T>(d_plan, blksize))) return ier;
// now d_plan->fk = d_plan->fw contains the spread values
// Step 2: Type 3 NUFFT
// Step 2: Type 2 NUFFT
// type 2 goes from fk to c
// saving the results directly in the user output array d_fk
// it needs to do blksize transforms
29 changes: 20 additions & 9 deletions src/cuda/common.cu
@@ -21,10 +21,13 @@ namespace common {
using namespace cufinufft::spreadinterp;
using std::max;

/* Kernel for computing approximations of exact Fourier series coeffs of
cnufftspread's real symmetric kernel. */
// a , f are intermediate results from function onedim_fseries_kernel_precomp()
// (see cufinufft/contrib/common.cpp for description)
/** Kernel for computing approximations of exact Fourier series coeffs of
* cnufftspread's real symmetric kernel.
* a, f are intermediate results from function onedim_fseries_kernel_precomp()
* (see cufinufft/contrib/common.cpp for description).
* This is the equispaced frequency case, used by types 1 & 2, matching
* onedim_fseries_kernel in the CPU code.
*/
template<typename T>
__global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, T *a,
T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3,
@@ -60,6 +63,13 @@ __global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, T *a,
}
}

/** Kernel for computing approximations of exact Fourier series coeffs of
* cnufftspread's real symmetric kernel.
* a, f are intermediate results from function onedim_fseries_kernel_precomp()
* (see cufinufft/contrib/common.cpp for description).
* This is the arbitrary frequency case (hence the extra kx, ky, kz arguments), used
* by type 3, matching onedim_nuft_kernel in the CPU code.
*/
template<typename T>
__global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, T *a, T *kx,
T *ky, T *kz, T *fwkerhalf1, T *fwkerhalf2,
@@ -129,6 +139,7 @@ int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f, T *d_a, T
narrowness of kernel. Evaluates at set of arbitrary freqs k in [-pi, pi),
for a kernel with x measured in grid-spacings. (See previous routine for
FT definition).
It implements onedim_nuft_kernel from the CPU code.
Marco Barbone 08/28/2024
*/
@@ -177,13 +188,13 @@ void set_nf_type12(CUFINUFFT_BIGINT ms, cufinufft_opts opts, finufft_spread_opts
Inputs:
nf - size of 1d uniform spread grid, must be even.
opts - spreading opts object, needed to eval kernel (must be already set up)
phase_winding - if true, compute normalization factors for phase winding rates,
otherwise compute scaled quadrature nodes
phase_winding - if true (types 1-2), use the scaling for the equispaced case;
otherwise (type 3), use the scaling for the general kx,ky,kz case
Outputs:
a - normalization factors if phase winding is true, otherwise scaled quadrature nodes
f - function values at quadrature nodes multiplied with quadrature weights
(a, f are provided as the inputs of onedim_fseries_kernel_compute() defined below)
a - vector of scaled quadrature nodes;
f - function values at quadrature nodes multiplied with quadrature weights (a, f are
provided as the inputs of onedim_fseries_kernel_compute() defined below)
*/
template<typename T, bool phase_winding = true>
void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, T *a,
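For orientation (an assumed summary matching the CPU routines named above): fseries_kernel_compute presumably combines these per dimension as a cosine quadrature sum, roughly fwkerhalf[j] ≈ sum_n f_n * cos(j * a_n) in the equispaced case, and fwkerhalf(k) ≈ sum_n f_n * cos(k * a_n) at the arbitrary type 3 target frequencies k.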