From 5bce672c705e0791169f4d4711a033cbd6706997 Mon Sep 17 00:00:00 2001
From: Marco Barbone <mbarbone@flatironinstitute.org>
Date: Thu, 6 Jun 2024 18:02:39 -0400
Subject: [PATCH] Reformat C/C++/CUDA sources, CMake files and CMake presets

---
 CMakeLists.txt                            |   86 +-
 CMakePresets.json                         |  321 +-
 contrib/legendre_rule_fast.cpp            |  274 +-
 contrib/legendre_rule_fast.h              |    8 +-
 devel/eval_ker_expts.cpp                  |   49 +-
 devel/eval_ker_expts2.cpp                 |   62 +-
 devel/eval_ker_expts_libin_simd64.cpp     |   75 +-
 devel/eval_ker_expts_ludvig.cpp           |   76 +-
 devel/foldrescale.cpp                     |  151 +-
 devel/foldrescale_perf.cpp                |   83 +-
 devel/foldrescale_perf2.cpp               |  317 +-
 devel/interp_square_nowrap.cpp            |   38 +-
 devel/padding.cpp                         |  257 +-
 devel/test_ker_ppval.cpp                  |  159 +-
 devel/time2d2interp.cpp                   |  102 +-
 examples/cuda/example2d1many.cpp          |  176 +-
 examples/cuda/example2d2many.cpp          |  193 +-
 examples/cuda/getting_started.cpp         |  127 +-
 examples/guru1d1.cpp                      |   73 +-
 examples/guru1d1f.cpp                     |   73 +-
 examples/guru2d1.cpp                      |   87 +-
 examples/gurumany1d1.cpp                  |   79 +-
 examples/many1d1.cpp                      |   72 +-
 examples/simple1d1.cpp                    |   61 +-
 examples/simple1d1f.cpp                   |   60 +-
 examples/simple2d1.cpp                    |   85 +-
 examples/simulplans1d1.cpp                |   87 +-
 examples/threadsafe1d1.cpp                |   83 +-
 examples/threadsafe2d2f.cpp               |   41 +-
 fortran/finufftfort.cpp                   |  227 +-
 include/cufinufft.h                       |   18 +-
 include/cufinufft/common.h                |   31 +-
 include/cufinufft/contrib/helper_cuda.h   |  124 +-
 include/cufinufft/cudeconvolve.h          |   47 +-
 include/cufinufft/defs.h                  |   15 +-
 include/cufinufft/impl.h                  |  688 ++--
 include/cufinufft/memtransfer.h           |   21 +-
 include/cufinufft/precision_independent.h |   47 +-
 include/cufinufft/spreadinterp.h          |  141 +-
 include/cufinufft/types.h                 |  141 +-
 include/cufinufft/utils.h                 |   63 +-
 include/cufinufft_opts.h                  |   34 +-
 include/finufft.h                         |    5 +-
 include/finufft/dirft.h                   |   22 +-
 include/finufft/fftw_defs.h               |   52 +-
 include/finufft/spreadinterp.h            |   60 +-
 include/finufft/test_defs.h               |   10 +-
 include/finufft/utils.h                   |   25 +-
 include/finufft/utils_precindep.h         |   51 +-
 include/finufft_eitherprec.h              |  164 +-
 include/finufft_opts.h                    |   31 +-
 include/finufft_spread_opts.h             |   30 +-
 matlab/finufft.cpp                        | 3626 ++++++++++-----------
 perftest/big2d2f.cpp                      |   30 +-
 perftest/cuda/cuperftest.cu               |  485 ++-
 perftest/guru_timing_test.cpp             |  468 +--
 perftest/manysmallprobs.cpp               |   82 +-
 perftest/spreadtestnd.cpp                 |  351 +-
 src/cuda/1d/cufinufft1d.cu                |  137 +-
 src/cuda/1d/interp1d_wrapper.cu           |   98 +-
 src/cuda/1d/spread1d_wrapper.cu           |  409 +--
 src/cuda/2d/cufinufft2d.cu                |  138 +-
 src/cuda/2d/interp2d_wrapper.cu           |  229 +-
 src/cuda/2d/spread2d_wrapper.cu           |  457 +--
 src/cuda/3d/cufinufft3d.cu                |  132 +-
 src/cuda/3d/interp3d_wrapper.cu           |  250 +-
 src/cuda/3d/spread3d_wrapper.cu           |  946 +++---
 src/cuda/common.cu                        |  229 +-
 src/cuda/cufinufft.cu                     |  131 +-
 src/cuda/deconvolve_wrapper.cu            |  320 +-
 src/cuda/memtransfer_wrapper.cu           |  601 ++--
 src/cuda/precision_independent.cu         |  359 +-
 src/cuda/spreadinterp.cpp                 |  118 +-
 src/cuda/utils.cpp                        |   45 +-
 src/finufft.cpp                           | 1174 +++----
 src/ker_horner_allw_loop_constexpr.h      | 1129 +++++--
 src/simpleinterfaces.cpp                  |  299 +-
 src/utils.cpp                             |   64 +-
 src/utils_precindep.cpp                   |   50 +-
 test/basicpassfail.cpp                    |   55 +-
 test/cuda/cufinufft1d_test.cu             |  361 +-
 test/cuda/cufinufft2d1nupts_test.cu       |  404 +--
 test/cuda/cufinufft2d_test.cu             |  361 +-
 test/cuda/cufinufft2dmany_test.cu         |  385 +--
 test/cuda/cufinufft3d_test.cu             |  388 +--
 test/cuda/fseries_kernel_test.cu          |  263 +-
 test/directft/dirft1d.cpp                 |   51 +-
 test/directft/dirft2d.cpp                 |   80 +-
 test/directft/dirft3d.cpp                 |  106 +-
 test/dumbinputs.cpp                       |  718 ++--
 test/finufft1d_test.cpp                   |  248 +-
 test/finufft1dmany_test.cpp               |  315 +-
 test/finufft2d_test.cpp                   |  260 +-
 test/finufft2dmany_test.cpp               |  342 +-
 test/finufft3d_test.cpp                   |  282 +-
 test/finufft3dmany_test.cpp               |  363 ++-
 test/testutils.cpp                        |   48 +-
 97 files changed, 11922 insertions(+), 10867 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e4888955b..1626ad35e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,22 +5,22 @@ project(finufft VERSION 2.2.0 LANGUAGES C CXX)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
 set(GNU_LIKE_FRONTENDS AppleClang Clang GNU)
-if(CMAKE_CXX_COMPILER_ID IN_LIST GNU_LIKE_FRONTENDS)
+if (CMAKE_CXX_COMPILER_ID IN_LIST GNU_LIKE_FRONTENDS)
     # Set custom compiler flags for gcc-compatible compilers
     set(FINUFFT_CXX_FLAGS_RELEASE -funroll-loops -ffp-contract=fast)
     set(FINUFFT_CXX_FLAGS_RELWITHDEBINFO -O3 -g -DNDEBUG ${FINUFFT_CXX_FLAGS_RELEASE})
-endif()
+endif ()
 
 include(CTest)
 
 if (CMAKE_CXX_COMPILER_ID IN_LIST GNU_LIKE_FRONTENDS)
-    if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc|ppc64|powerpc|powerpc64" OR (APPLE AND CMAKE_OSX_ARCHITECTURES MATCHES "ppc|ppc64"))
+    if (CMAKE_SYSTEM_PROCESSOR MATCHES "ppc|ppc64|powerpc|powerpc64" OR (APPLE AND CMAKE_OSX_ARCHITECTURES MATCHES "ppc|ppc64"))
         # PowerPC arch does not have -march flag.
         set(FINUFFT_ARCH_FLAGS "-mtune=native" CACHE STRING "Compiler flags for specifying target architecture.")
-    else()
+    else ()
         set(FINUFFT_ARCH_FLAGS "-march=native" CACHE STRING "Compiler flags for specifying target architecture.")
-    endif()
-endif()
+    endif ()
+endif ()
 set(FINUFFT_FFTW_SUFFIX "OpenMP" CACHE STRING "Suffix for FFTW libraries (e.g. OpenMP, Threads etc.)")
 set(FINUFFT_FFTW_LIBRARIES "DEFAULT" CACHE STRING "Specify a custom FFTW library")
 
@@ -38,16 +38,16 @@ option(FINUFFT_STATIC_LINKING "Whether to link the static FINUFFT library (libfi
 option(FINUFFT_BUILD_DEVEL "Whether to build development executables" OFF)
 # sphinx tag (don't remove): @cmake_opts_end
 
-if(FINUFFT_USE_CPU)
+if (FINUFFT_USE_CPU)
     # suppress Windows warnings about "unsafe" functions
-    if(WIN32)
+    if (WIN32)
         add_definitions(-D_CRT_SECURE_NO_WARNINGS)
-    endif()
+    endif ()
 
     # make apple with gnu use old linker, new linker breaks, see issue #360
-    if((APPLE) AND (CMAKE_CXX_COMPILER_ID STREQUAL "GNU"))
+    if ((APPLE) AND (CMAKE_CXX_COMPILER_ID STREQUAL "GNU"))
         add_link_options("-ld64")
-    endif()
+    endif ()
 
     set(CPM_DOWNLOAD_VERSION 0.38.0)
     set(FFTW_VERSION 3.3.10)
@@ -56,7 +56,7 @@ if(FINUFFT_USE_CPU)
     include(cmake/setupCPM.cmake)
     include(cmake/setupFFTW.cmake)
     include(cmake/setupXSIMD.cmake)
-endif()
+endif ()
 
 if (FINUFFT_BUILD_MATLAB)
     # When building for matlab, we will fetch the OpenMP library used by matlab
@@ -104,20 +104,20 @@ endfunction()
 
 # Utility function to link static/dynamic lib
 function(finufft_link_test target)
-    if(FINUFFT_STATIC_LINKING)
+    if (FINUFFT_STATIC_LINKING)
         target_link_libraries(${target} PRIVATE finufft_static)
-        if(FINUFFT_USE_OPENMP)
+        if (FINUFFT_USE_OPENMP)
             target_link_libraries(${target} PRIVATE OpenMP::OpenMP_CXX)
-            if(WIN32)
+            if (WIN32)
                 target_link_options(${target} PRIVATE ${OpenMP_CXX_FLAGS})
-            endif()
-        endif()
-    else()
+            endif ()
+        endif ()
+    else ()
         target_link_libraries(${target} PRIVATE finufft)
-        if(WIN32)
+        if (WIN32)
             target_compile_definitions(${target} PRIVATE FINUFFT_DLL)
-        endif()
-    endif()
+        endif ()
+    endif ()
     enable_asan(${target})
 endfunction()
 
@@ -140,9 +140,9 @@ function(set_finufft_options target)
         target_link_libraries(${target} PRIVATE OpenMP::OpenMP_CXX)
         # there are issues on windows with OpenMP and CMake, so we need to manually add the flags
         # otherwise there are link errors
-        if(WIN32)
+        if (WIN32)
             target_link_options(${target} PRIVATE ${OpenMP_CXX_FLAGS})
-        endif()
+        endif ()
     else ()
         if (CMAKE_CXX_COMPILER_ID IN_LIST FINUFFT_GNU_LIKE_COMPILERS)
             # OpenMP disabled, suppress unknown pragma warnings to avoid spam.
@@ -154,16 +154,16 @@ function(set_finufft_options target)
     # include them since we need them for build not for install
     # trying to include them directly into the fftw and fftwf targets causes issues with
     # the latest version of cmake, so we do it here instead.
-    if ( (NOT FFTW_FOUND ) OR (FINUFFT_FFTW_LIBRARIES STREQUAL DOWNLOAD))
-        list (GET FINUFFT_FFTW_LIBRARIES 0 element)
+    if ((NOT FFTW_FOUND) OR (FINUFFT_FFTW_LIBRARIES STREQUAL DOWNLOAD))
+        list(GET FINUFFT_FFTW_LIBRARIES 0 element)
         get_property(FFTW_SOURCE_DIR TARGET ${element} PROPERTY SOURCE_DIR)
         set(FFTW_INCLUDE_DIR ${FFTW_SOURCE_DIR}/api)
         target_include_directories(${target} PUBLIC ${FFTW_INCLUDE_DIR})
-    endif()
+    endif ()
 
 endfunction()
 
-if(FINUFFT_USE_CPU)
+if (FINUFFT_USE_CPU)
     # Main finufft libraries
     add_library(finufft_f32 OBJECT ${FINUFFT_PRECISION_DEPENDENT_SOURCES})
     target_compile_definitions(finufft_f32 PRIVATE SINGLE)
@@ -176,7 +176,7 @@ if(FINUFFT_USE_CPU)
     set_finufft_options(finufft_f64)
     target_link_libraries(finufft_f64 PUBLIC ${FINUFFT_FFTW_LIBRARIES})
     target_link_libraries(finufft_f64 PRIVATE xsimd)
-    if(WIN32)
+    if (WIN32)
         add_library(finufft_f32_dll OBJECT ${FINUFFT_PRECISION_DEPENDENT_SOURCES})
         target_compile_definitions(finufft_f32_dll PRIVATE SINGLE dll_EXPORTS FINUFFT_DLL)
         set_finufft_options(finufft_f32_dll)
@@ -186,43 +186,43 @@ if(FINUFFT_USE_CPU)
         target_compile_definitions(finufft_f64_dll PRIVATE dll_EXPORTS FINUFFT_DLL)
         set_finufft_options(finufft_f64_dll)
         target_link_libraries(finufft_f64_dll PUBLIC ${FINUFFT_FFTW_LIBRARIES})
-    endif()
+    endif ()
 
     add_library(finufft SHARED src/utils_precindep.cpp contrib/legendre_rule_fast.cpp)
     target_compile_definitions(finufft PRIVATE dll_EXPORTS FINUFFT_DLL)
     set_finufft_options(finufft)
-    if(NOT WIN32)
+    if (NOT WIN32)
         target_link_libraries(finufft PUBLIC finufft_f32 finufft_f64)
-    else()
+    else ()
         target_link_libraries(finufft PUBLIC finufft_f32_dll finufft_f64_dll)
-    endif()
+    endif ()
     # windows does not have a math library, so we need to exclude it
-    if(NOT WIN32)
+    if (NOT WIN32)
         target_link_libraries(finufft PUBLIC m)
-    endif()
+    endif ()
     target_include_directories(finufft PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include")
 
     add_library(finufft_static STATIC src/utils_precindep.cpp contrib/legendre_rule_fast.cpp)
     set_finufft_options(finufft_static)
     target_link_libraries(finufft_static PUBLIC finufft_f32 finufft_f64)
     # windows does not have a math library, so we need to exclude it
-    if(NOT WIN32)
+    if (NOT WIN32)
         target_link_libraries(finufft_static PUBLIC m)
-    endif()
+    endif ()
     target_include_directories(finufft_static PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include")
 
     file(GLOB FINUFFT_PUBLIC_HEADERS "${CMAKE_CURRENT_SOURCE_DIR}/include/finufft*.h")
     set_target_properties(finufft PROPERTIES PUBLIC_HEADER "${FINUFFT_PUBLIC_HEADERS}")
 
     list(APPEND INSTALL_TARGETS finufft finufft_static)
-endif()
+endif ()
 
-if(FINUFFT_USE_CUDA)
-    if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+if (FINUFFT_USE_CUDA)
+    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
         message("FINUFFT WARNING: No CUDA architecture supplied via '-DCMAKE_CUDA_ARCHITECTURES=...', defaulting to '60;70;75;'")
         message("See: https://developer.nvidia.com/cuda-gpus for more details on what architecture to supply.")
         set(CMAKE_CUDA_ARCHITECTURES "60;70;75" CACHE STRING "" FORCE)
-    endif()
+    endif ()
     enable_language(CUDA)
     find_package(CUDAToolkit REQUIRED)
     add_subdirectory(src/cuda)
@@ -231,7 +231,7 @@ if(FINUFFT_USE_CUDA)
     endif ()
 
     list(APPEND INSTALL_TARGETS cufinufft cufinufft_static)
-endif()
+endif ()
 
 # Add tests defined in their own directory
 if (BUILD_TESTING AND FINUFFT_BUILD_TESTS AND FINUFFT_USE_CPU)
@@ -278,7 +278,7 @@ if (FINUFFT_USE_CPU)
         install(FILES ${PROJECT_SOURCE_DIR}/include/finufft.fh
                 DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
         )
-    endif()
+    endif ()
 endif ()
 if (FINUFFT_USE_CUDA)
     install(DIRECTORY ${PROJECT_SOURCE_DIR}/examples/cuda
@@ -286,4 +286,4 @@ if (FINUFFT_USE_CUDA)
             PATTERN "README" EXCLUDE
             PATTERN "CMakeLists.txt" EXCLUDE
     )
-endif()
+endif ()
diff --git a/CMakePresets.json b/CMakePresets.json
index 0dcb3a5eb..b04204500 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -1,163 +1,164 @@
 {
-    "version": 2,
-    "cmakeMinimumRequired": {
-        "major": 3,
-        "minor": 19,
-        "patch": 0
+  "version": 2,
+  "cmakeMinimumRequired": {
+    "major": 3,
+    "minor": 19,
+    "patch": 0
+  },
+  "configurePresets": [
+    {
+      "name": "default",
+      "binaryDir": "build/default",
+      "displayName": "Default",
+      "description": "Default release configuration (ninja)",
+      "generator": "Ninja",
+      "cacheVariables": {
+        "CMAKE_BUILD_TYPE": "RelWithDebInfo"
+      }
     },
-    "configurePresets": [
-        {
-            "name": "default",
-            "binaryDir": "build/default",
-            "displayName": "Default",
-            "description": "Default release configuration (ninja)",
-            "generator": "Ninja",
-            "cacheVariables": {
-                "CMAKE_BUILD_TYPE": "RelWithDebInfo"
-            }
-        },
-        {
-            "name": "ninja-multi",
-            "binaryDir": "build/ninja",
-            "displayName": "Ninja Multi-config",
-            "description": "Multi-configuration build with ninja",
-            "generator": "Ninja Multi-Config"
-        },
-        {
-            "name": "dev",
-            "binaryDir": "build/dev",
-            "displayName": "Development",
-            "description": "Development configuration (full tests and examples)",
-            "generator": "Ninja Multi-Config",
-            "cacheVariables": {
-                "FINUFFT_BUILD_TESTS": "ON",
-                "FINUFFT_BUILD_EXAMPLES": "ON",
-                "FINUFFT_BUILD_DEVEL": "ON"
-            }
-        },
-        {
-            "name": "benchmark",
-            "binaryDir": "build/benchmark",
-            "displayName": "Benchmark",
-            "description": "Benchmark release configuration (ninja)",
-            "generator": "Ninja",
-            "cacheVariables": {
-                "CMAKE_BUILD_TYPE": "RelWithDebInfo",
-                "FINUFFT_BUILD_TESTS": "ON",
-                "FINUFFT_BUILD_EXAMPLES": "ON",
-                "FINUFFT_FFTW_SUFFIX": "",
-                "FINUFFT_USE_OPENMP": "OFF"
-            }
-        },
-        {
-            "name": "manylinux",
-            "binaryDir": "build/manylinux",
-            "displayName": "manylinux",
-            "description": "Configuration for maximum binary compatibility",
-            "inherits": "default",
-            "cacheVariables": {
-                "FINUFFT_ARCH_FLAGS": "-march=x86-64 -mtune=generic -msse4"
-            }
-        },
-        {
-            "name": "singlethreaded",
-            "binaryDir": "build/singlethreaded",
-            "displayName": "singlethreaded",
-            "description": "Configuration for single-threaded build. Disables OpenMP for finufft and FFTW",
-            "inherits": "default",
-            "cacheVariables": {
-                "FINUFFT_FFTW_SUFFIX": "",
-                "FINUFFT_USE_OPENMP": "OFF"
-            }
-        },
-        {
-            "name": "icx",
-            "binaryDir": "build/icx",
-            "displayName": "Intel Compiler (llvm)",
-            "description": "Build with Intel Compiler",
-            "generator": "Ninja Multi-Config",
-            "cacheVariables": {
-                "CMAKE_C_COMPILER": "icx",
-                "CMAKE_CXX_COMPILER": "icpx",
-                "CMAKE_Fortran_COMPILER": "ifx",
-                "FINUFFT_ARCH_FLAGS": "-xHost",
-                "CMAKE_CXX_FLAGS": "-fp-model=strict"
-            }
-        },
-        {
-            "name": "icc",
-            "binaryDir": "build/icc",
-            "displayName": "Intel Compiler",
-            "description": "Build with Intel Compiler",
-            "generator": "Ninja Multi-Config",
-            "cacheVariables": {
-                "CMAKE_C_COMPILER": "icc",
-                "CMAKE_CXX_COMPILER": "icpc",
-                "CMAKE_Fortran_COMPILER": "ifort",
-                "FINUFFT_ARCH_FLAGS": "-xHost",
-                "CMAKE_CXX_FLAGS": "-fp-model=strict"
-            }
-        },
-        {
-            "name": "matlab",
-            "binaryDir": "build/matlab",
-            "displayName": "matlab",
-            "description": "Build with the matlab interface",
-            "generator": "Ninja Multi-Config",
-            "cacheVariables": {
-                "FINUFFT_FFTW_SUFFIX": "Threads",
-                "FINUFFT_BUILD_MATLAB": "ON",
-                "FINUFFT_ENABLE_SANITIZERS": "OFF"
-            }
-        }
-    ],
-    "buildPresets": [
-        {
-            "name": "default",
-            "configurePreset": "default"
-        },
-        {
-            "name": "dev",
-            "configurePreset": "dev",
-            "configuration": "RelWithDebInfo"
-        },
-        {
-            "name": "ninja-multi",
-            "configurePreset": "ninja-multi",
-            "configuration": "RelWithDebInfo"
-        },
-        {
-            "name": "manylinux",
-            "configurePreset": "manylinux"
-        },
-        {
-            "name": "singlethreaded",
-            "configurePreset": "singlethreaded"
-        },
-        {
-            "name": "icc",
-            "configurePreset": "icc",
-            "configuration": "RelWithDebInfo"
-        },
-        {
-            "name": "icx",
-            "configurePreset": "icx",
-            "configuration": "RelWithDebInfo"
-        },
-        {
-            "name": "matlab",
-            "configurePreset": "matlab",
-            "configuration": "Release"
-        }
-    ],
-    "testPresets": [
-        {
-            "name": "dev",
-            "configurePreset": "dev",
-            "configuration": "Debug",
-            "environment": {
-                "OMP_NUM_THREADS": "1"
-            }
-        }
-    ]
+    {
+      "name": "ninja-multi",
+      "binaryDir": "build/ninja",
+      "displayName": "Ninja Multi-config",
+      "description": "Multi-configuration build with ninja",
+      "generator": "Ninja Multi-Config"
+    },
+    {
+      "name": "dev",
+      "binaryDir": "build/dev",
+      "displayName": "Development",
+      "description": "Development configuration (full tests and examples)",
+      "generator": "Ninja Multi-Config",
+      "cacheVariables": {
+        "FINUFFT_BUILD_TESTS": "ON",
+        "FINUFFT_BUILD_EXAMPLES": "ON",
+        "FINUFFT_BUILD_DEVEL": "ON"
+      }
+    },
+    {
+      "name": "benchmark",
+      "binaryDir": "build/benchmark",
+      "displayName": "Benchmark",
+      "description": "Benchmark release configuration (ninja)",
+      "generator": "Ninja",
+      "cacheVariables": {
+        "CMAKE_BUILD_TYPE": "RelWithDebInfo",
+        "FINUFFT_BUILD_TESTS": "ON",
+        "FINUFFT_BUILD_EXAMPLES": "ON",
+        "FINUFFT_FFTW_SUFFIX": "",
+        "FINUFFT_USE_OPENMP": "OFF"
+      }
+    },
+    {
+      "name": "manylinux",
+      "binaryDir": "build/manylinux",
+      "displayName": "manylinux",
+      "description": "Configuration for maximum binary compatibility",
+      "inherits": "default",
+      "cacheVariables": {
+        "FINUFFT_ARCH_FLAGS": "-march=x86-64 -mtune=generic -msse4"
+      }
+    },
+    {
+      "name": "singlethreaded",
+      "binaryDir": "build/singlethreaded",
+      "displayName": "singlethreaded",
+      "description":
+          "Configuration for single-threaded build. Disables OpenMP for finufft and FFTW",
+      "inherits": "default",
+      "cacheVariables": {
+        "FINUFFT_FFTW_SUFFIX": "",
+        "FINUFFT_USE_OPENMP": "OFF"
+      }
+    },
+    {
+      "name": "icx",
+      "binaryDir": "build/icx",
+      "displayName": "Intel Compiler (llvm)",
+      "description": "Build with Intel Compiler",
+      "generator": "Ninja Multi-Config",
+      "cacheVariables": {
+        "CMAKE_C_COMPILER": "icx",
+        "CMAKE_CXX_COMPILER": "icpx",
+        "CMAKE_Fortran_COMPILER": "ifx",
+        "FINUFFT_ARCH_FLAGS": "-xHost",
+        "CMAKE_CXX_FLAGS": "-fp-model=strict"
+      }
+    },
+    {
+      "name": "icc",
+      "binaryDir": "build/icc",
+      "displayName": "Intel Compiler",
+      "description": "Build with Intel Compiler",
+      "generator": "Ninja Multi-Config",
+      "cacheVariables": {
+        "CMAKE_C_COMPILER": "icc",
+        "CMAKE_CXX_COMPILER": "icpc",
+        "CMAKE_Fortran_COMPILER": "ifort",
+        "FINUFFT_ARCH_FLAGS": "-xHost",
+        "CMAKE_CXX_FLAGS": "-fp-model=strict"
+      }
+    },
+    {
+      "name": "matlab",
+      "binaryDir": "build/matlab",
+      "displayName": "matlab",
+      "description": "Build with the matlab interface",
+      "generator": "Ninja Multi-Config",
+      "cacheVariables": {
+        "FINUFFT_FFTW_SUFFIX": "Threads",
+        "FINUFFT_BUILD_MATLAB": "ON",
+        "FINUFFT_ENABLE_SANITIZERS": "OFF"
+      }
+    }
+  ],
+  "buildPresets": [
+    {
+      "name": "default",
+      "configurePreset": "default"
+    },
+    {
+      "name": "dev",
+      "configurePreset": "dev",
+      "configuration": "RelWithDebInfo"
+    },
+    {
+      "name": "ninja-multi",
+      "configurePreset": "ninja-multi",
+      "configuration": "RelWithDebInfo"
+    },
+    {
+      "name": "manylinux",
+      "configurePreset": "manylinux"
+    },
+    {
+      "name": "singlethreaded",
+      "configurePreset": "singlethreaded"
+    },
+    {
+      "name": "icc",
+      "configurePreset": "icc",
+      "configuration": "RelWithDebInfo"
+    },
+    {
+      "name": "icx",
+      "configurePreset": "icx",
+      "configuration": "RelWithDebInfo"
+    },
+    {
+      "name": "matlab",
+      "configurePreset": "matlab",
+      "configuration": "Release"
+    }
+  ],
+  "testPresets": [
+    {
+      "name": "dev",
+      "configurePreset": "dev",
+      "configuration": "Debug",
+      "environment": {
+        "OMP_NUM_THREADS": "1"
+      }
+    }
+  ]
 }
diff --git a/contrib/legendre_rule_fast.cpp b/contrib/legendre_rule_fast.cpp
index 01b626cc3..a91119161 100644
--- a/contrib/legendre_rule_fast.cpp
+++ b/contrib/legendre_rule_fast.cpp
@@ -12,16 +12,16 @@
 #include <cstdlib>
 
 namespace finufft {
-  namespace quadrature {
-  
-void legendre_compute_glr ( int n, double x[], double w[] );
-void legendre_compute_glr0 ( int n, double *p, double *pp );
-void legendre_compute_glr1 ( int n, double *roots, double *ders );
-void legendre_compute_glr2 ( double p, int n, double *roots, double *ders );
-double rk2_leg ( double t, double tn, double x, int n );
-double ts_mult ( double *u, double h, int n );
-
-void legendre_compute_glr ( int n, double x[], double w[] )
+namespace quadrature {
+
+void legendre_compute_glr(int n, double x[], double w[]);
+void legendre_compute_glr0(int n, double *p, double *pp);
+void legendre_compute_glr1(int n, double *roots, double *ders);
+void legendre_compute_glr2(double p, int n, double *roots, double *ders);
+double rk2_leg(double t, double tn, double x, int n);
+double ts_mult(double *u, double h, int n);
+
+void legendre_compute_glr(int n, double x[], double w[])
 /******************************************************************************/
 /*
   Purpose:
@@ -30,7 +30,7 @@ void legendre_compute_glr ( int n, double x[], double w[] )
 
   Licensing:
 
-    This code is distributed under the GNU LGPL license. 
+    This code is distributed under the GNU LGPL license.
 
   Modified:
 
@@ -43,8 +43,8 @@ void legendre_compute_glr ( int n, double x[], double w[] )
 
   Reference:
 
-    Andreas Glaser, Xiangtao Liu, Vladimir Rokhlin, 
-    A fast algorithm for the calculation of the roots of special functions, 
+    Andreas Glaser, Xiangtao Liu, Vladimir Rokhlin,
+    A fast algorithm for the calculation of the roots of special functions,
     SIAM Journal on Scientific Computing,
     Volume 29, Number 4, pages 1420-1438, 2007.
 
@@ -61,47 +61,41 @@ void legendre_compute_glr ( int n, double x[], double w[] )
   double p;
   double pp;
   double w_sum;
-/*
-  Get the value and derivative of the N-th Legendre polynomial at 0.
-*/
-  legendre_compute_glr0 ( n, &p, &pp );
-/*
-  Either zero is a root, or we have to call a function to find the first root.
-*/  
-  if ( n % 2 == 1 )
-  {
-    x[(n-1)/2] = p;
-    w[(n-1)/2] = pp;
+  /*
+    Get the value and derivative of the N-th Legendre polynomial at 0.
+  */
+  legendre_compute_glr0(n, &p, &pp);
+  /*
+    Either zero is a root, or we have to call a function to find the first root.
+  */
+  if (n % 2 == 1) {
+    x[(n - 1) / 2] = p;
+    w[(n - 1) / 2] = pp;
+  } else {
+    legendre_compute_glr2(p, n, &x[n / 2], &w[n / 2]);
   }
-  else
-  {
-    legendre_compute_glr2 ( p, n, &x[n/2], &w[n/2] );
-  }
-/*
-  Get the complete set of roots and derivatives.
-*/
-  legendre_compute_glr1 ( n, x, w );
-/*
-  Compute the weights.
-*/
-  for ( i = 0; i < n; i++ )
-  {
-    w[i] = 2.0 / ( 1.0 - x[i] ) / ( 1.0 + x[i] ) / w[i] / w[i];
+  /*
+    Get the complete set of roots and derivatives.
+  */
+  legendre_compute_glr1(n, x, w);
+  /*
+    Compute the weights.
+  */
+  for (i = 0; i < n; i++) {
+    w[i] = 2.0 / (1.0 - x[i]) / (1.0 + x[i]) / w[i] / w[i];
   }
   w_sum = 0.0;
-  for ( i = 0; i < n; i++ )
-  {
+  for (i = 0; i < n; i++) {
     w_sum = w_sum + w[i];
   }
-  for ( i = 0; i < n; i++ )
-  {
+  for (i = 0; i < n; i++) {
     w[i] = 2.0 * w[i] / w_sum;
   }
   return;
 }
 /******************************************************************************/
 
-void legendre_compute_glr0 ( int n, double *p, double *pp )
+void legendre_compute_glr0(int n, double *p, double *pp)
 
 /******************************************************************************/
 /*
@@ -111,7 +105,7 @@ void legendre_compute_glr0 ( int n, double *p, double *pp )
 
   Licensing:
 
-    This code is distributed under the GNU LGPL license. 
+    This code is distributed under the GNU LGPL license.
 
   Modified:
 
@@ -124,8 +118,8 @@ void legendre_compute_glr0 ( int n, double *p, double *pp )
 
   Reference:
 
-    Andreas Glaser, Xiangtao Liu, Vladimir Rokhlin, 
-    A fast algorithm for the calculation of the roots of special functions, 
+    Andreas Glaser, Xiangtao Liu, Vladimir Rokhlin,
+    A fast algorithm for the calculation of the roots of special functions,
     SIAM Journal on Scientific Computing,
     Volume 29, Number 4, pages 1420-1438, 2007.
 
@@ -144,18 +138,17 @@ void legendre_compute_glr0 ( int n, double *p, double *pp )
   double ppm1;
   double ppm2;
 
-  pm2 = 0.0;
-  pm1 = 1.0;
+  pm2  = 0.0;
+  pm1  = 1.0;
   ppm2 = 0.0;
   ppm1 = 0.0;
 
-  for ( k = 0; k < n; k++ )
-  {
-    dk = ( double ) k;
-    *p = - dk * pm2 / ( dk + 1.0 );
-    *pp = ( ( 2.0 * dk + 1.0 ) * pm1 - dk * ppm2 ) / ( dk + 1.0 );
-    pm2 = pm1;
-    pm1 = *p;
+  for (k = 0; k < n; k++) {
+    dk   = (double)k;
+    *p   = -dk * pm2 / (dk + 1.0);
+    *pp  = ((2.0 * dk + 1.0) * pm1 - dk * ppm2) / (dk + 1.0);
+    pm2  = pm1;
+    pm1  = *p;
     ppm2 = ppm1;
     ppm1 = *pp;
   }
@@ -163,7 +156,7 @@ void legendre_compute_glr0 ( int n, double *p, double *pp )
 }
 /******************************************************************************/
 
-void legendre_compute_glr1 ( int n, double *x, double *ders )
+void legendre_compute_glr1(int n, double *x, double *ders)
 
 /******************************************************************************/
 /*
@@ -179,7 +172,7 @@ void legendre_compute_glr1 ( int n, double *x, double *ders )
 
   Licensing:
 
-    This code is distributed under the GNU LGPL license. 
+    This code is distributed under the GNU LGPL license.
 
   Modified:
 
@@ -192,8 +185,8 @@ void legendre_compute_glr1 ( int n, double *x, double *ders )
 
   Reference:
 
-    Andreas Glaser, Xiangtao Liu, Vladimir Rokhlin, 
-    A fast algorithm for the calculation of the roots of special functions, 
+    Andreas Glaser, Xiangtao Liu, Vladimir Rokhlin,
+    A fast algorithm for the calculation of the roots of special functions,
     SIAM Journal on Scientific Computing,
     Volume 29, Number 4, pages 1420-1438, 2007.
 
@@ -202,11 +195,11 @@ void legendre_compute_glr1 ( int n, double *x, double *ders )
     Input, int N, the order of the Legendre polynomial.
 
     Input/output, double X[N].  On input, a starting value
-    has been set in one entry.  On output, the roots of the Legendre 
+    has been set in one entry.  On output, the roots of the Legendre
     polynomial.
 
     Input/output, double DERS[N].  On input, a starting value
-    has been set in one entry.  On output, the derivatives of the Legendre 
+    has been set in one entry.  On output, the derivatives of the Legendre
     polynomial at the zeros.
 
   Local Parameters:
@@ -228,27 +221,23 @@ void legendre_compute_glr1 ( int n, double *x, double *ders )
   double *up;
   double xp;
 
-  if ( n % 2 == 1 )
-  {
-    n2 = ( n - 1 ) / 2;
-    s = 1;
-  }
-  else
-  {
+  if (n % 2 == 1) {
+    n2 = (n - 1) / 2;
+    s  = 1;
+  } else {
     n2 = n / 2;
-    s = 0;
+    s  = 0;
   }
 
-  u = ( double * ) malloc ( ( m + 2 ) * sizeof ( double ) );
-  up = ( double * ) malloc ( ( m + 1 ) * sizeof ( double ) );
+  u  = (double *)malloc((m + 2) * sizeof(double));
+  up = (double *)malloc((m + 1) * sizeof(double));
 
-  dn = ( double ) n;
+  dn = (double)n;
 
-  for ( j = n2; j < n - 1; j++ )
-  {
+  for (j = n2; j < n - 1; j++) {
     xp = x[j];
 
-    h = rk2_leg ( pi/2.0, -pi/2.0, xp, n ) - xp;
+    h = rk2_leg(pi / 2.0, -pi / 2.0, xp, n) - xp;
 
     u[0] = 0.0;
     u[1] = 0.0;
@@ -257,41 +246,36 @@ void legendre_compute_glr1 ( int n, double *x, double *ders )
     up[0] = 0.0;
     up[1] = u[2];
 
-    for ( k = 0; k <= m - 2; k++ )
-    {
-      dk = ( double ) k;
+    for (k = 0; k <= m - 2; k++) {
+      dk = (double)k;
 
-      u[k+3] = 
-      ( 
-        2.0 * xp * ( dk + 1.0 ) * u[k+2]
-        + ( dk * ( dk + 1.0 ) - dn * ( dn + 1.0 ) ) * u[k+1] / ( dk + 1.0 )
-      ) / ( 1.0 - xp ) / ( 1.0 + xp ) / ( dk + 2.0 );
+      u[k + 3] = (2.0 * xp * (dk + 1.0) * u[k + 2] +
+                  (dk * (dk + 1.0) - dn * (dn + 1.0)) * u[k + 1] / (dk + 1.0)) /
+                 (1.0 - xp) / (1.0 + xp) / (dk + 2.0);
 
-      up[k+2] = ( dk + 2.0 ) * u[k+3];
+      up[k + 2] = (dk + 2.0) * u[k + 3];
     }
 
-    for ( l = 0; l < 5; l++ )
-    { 
-      h = h - ts_mult ( u, h, m ) / ts_mult ( up, h, m-1 );
+    for (l = 0; l < 5; l++) {
+      h = h - ts_mult(u, h, m) / ts_mult(up, h, m - 1);
     }
 
-    x[j+1] = xp + h;
-    ders[j+1] = ts_mult ( up, h, m-1 );
+    x[j + 1]    = xp + h;
+    ders[j + 1] = ts_mult(up, h, m - 1);
   }
 
-  free ( u );
-  free ( up );
+  free(u);
+  free(up);
 
-  for ( k = 0; k < n2 + s; k++ )
-  {
-    x[k] = - x[n-k-1];
-    ders[k] = ders[n-k-1];
+  for (k = 0; k < n2 + s; k++) {
+    x[k]    = -x[n - k - 1];
+    ders[k] = ders[n - k - 1];
   }
   return;
 }
 /******************************************************************************/
 
-void legendre_compute_glr2 ( double pn0, int n, double *x1,  double *d1 )
+void legendre_compute_glr2(double pn0, int n, double *x1, double *d1)
 
 /******************************************************************************/
 /*
@@ -308,7 +292,7 @@ void legendre_compute_glr2 ( double pn0, int n, double *x1,  double *d1 )
 
   Licensing:
 
-    This code is distributed under the GNU LGPL license. 
+    This code is distributed under the GNU LGPL license.
 
   Modified:
 
@@ -321,8 +305,8 @@ void legendre_compute_glr2 ( double pn0, int n, double *x1,  double *d1 )
 
   Reference:
 
-    Andreas Glaser, Xiangtao Liu, Vladimir Rokhlin, 
-    A fast algorithm for the calculation of the roots of special functions, 
+    Andreas Glaser, Xiangtao Liu, Vladimir Rokhlin,
+    A fast algorithm for the calculation of the roots of special functions,
     SIAM Journal on Scientific Computing,
     Volume 29, Number 4, pages 1420-1438, 2007.
 
@@ -345,55 +329,52 @@ void legendre_compute_glr2 ( double pn0, int n, double *x1,  double *d1 )
   double dn;
   int k;
   int l;
-  int m = 30;
+  int m           = 30;
   const double pi = 3.141592653589793;
   double t;
   double *u;
   double *up;
 
-  t = 0.0;
-  *x1 = rk2_leg ( t, -pi/2.0, 0.0, n );
+  t   = 0.0;
+  *x1 = rk2_leg(t, -pi / 2.0, 0.0, n);
 
-  u = ( double * ) malloc ( ( m + 2 ) * sizeof ( double ) );
-  up = ( double * ) malloc ( ( m + 1 ) * sizeof ( double ) );
+  u  = (double *)malloc((m + 2) * sizeof(double));
+  up = (double *)malloc((m + 1) * sizeof(double));
 
-  dn = ( double ) n;
-/*
-  U[0] and UP[0] are never used.
-  U[M+1] is set, but not used, and UP[M] is set and not used.
-  What gives?
-*/
+  dn = (double)n;
+  /*
+    U[0] and UP[0] are never used.
+    U[M+1] is set, but not used, and UP[M] is set and not used.
+    What gives?
+  */
   u[0] = 0.0;
   u[1] = pn0;
 
   up[0] = 0.0;
- 
-  for ( k = 0; k <= m - 2; k = k + 2 )
-  {
-    dk = ( double ) k;
-
-    u[k+2] = 0.0;
-    u[k+3] = ( dk * ( dk + 1.0 ) - dn * ( dn + 1.0 ) ) * u[k+1]
-      / ( dk + 1.0 ) / ( dk + 2.0 );
- 
-    up[k+1] = 0.0;
-    up[k+2] = ( dk + 2.0 ) * u[k+3];
+
+  for (k = 0; k <= m - 2; k = k + 2) {
+    dk = (double)k;
+
+    u[k + 2] = 0.0;
+    u[k + 3] = (dk * (dk + 1.0) - dn * (dn + 1.0)) * u[k + 1] / (dk + 1.0) / (dk + 2.0);
+
+    up[k + 1] = 0.0;
+    up[k + 2] = (dk + 2.0) * u[k + 3];
   }
-  
-  for ( l = 0; l < 5; l++ )
-  {
-    *x1 = *x1 - ts_mult ( u, *x1, m ) / ts_mult ( up, *x1, m-1 );
+
+  for (l = 0; l < 5; l++) {
+    *x1 = *x1 - ts_mult(u, *x1, m) / ts_mult(up, *x1, m - 1);
   }
-  *d1 = ts_mult ( up, *x1, m-1 );
+  *d1 = ts_mult(up, *x1, m - 1);
 
-  free ( u );
-  free ( up) ;
+  free(u);
+  free(up);
 
   return;
 }
 /******************************************************************************/
 
-double rk2_leg ( double t1, double t2, double x, int n )
+double rk2_leg(double t1, double t2, double x, int n)
 
 /******************************************************************************/
 /*
@@ -403,7 +384,7 @@ double rk2_leg ( double t1, double t2, double x, int n )
 
   Licensing:
 
-    This code is distributed under the GNU LGPL license. 
+    This code is distributed under the GNU LGPL license.
 
   Modified:
 
@@ -434,29 +415,27 @@ double rk2_leg ( double t1, double t2, double x, int n )
   double snn1;
   double t;
 
-  h = ( t2 - t1 ) / ( double ) m;
-  snn1 = sqrt ( ( double ) ( n * ( n + 1 ) ) );
+  h    = (t2 - t1) / (double)m;
+  snn1 = sqrt((double)(n * (n + 1)));
 
   t = t1;
 
-  for ( j = 0; j < m; j++ )
-  {
-    f = ( 1.0 - x ) * ( 1.0 + x );
-    k1 = - h * f / ( snn1 * sqrt ( f ) - 0.5 * x * sin ( 2.0 * t ) );
-    x = x + k1;
+  for (j = 0; j < m; j++) {
+    f  = (1.0 - x) * (1.0 + x);
+    k1 = -h * f / (snn1 * sqrt(f) - 0.5 * x * sin(2.0 * t));
+    x  = x + k1;
 
     t = t + h;
 
-    f = ( 1.0 - x ) * ( 1.0 + x );
-    k2 = - h * f / ( snn1 * sqrt ( f ) - 0.5 * x * sin ( 2.0 * t ) );   
-    x = x + 0.5 * ( k2 - k1 );
+    f  = (1.0 - x) * (1.0 + x);
+    k2 = -h * f / (snn1 * sqrt(f) - 0.5 * x * sin(2.0 * t));
+    x  = x + 0.5 * (k2 - k1);
   }
   return x;
 }
 /******************************************************************************/
 
-
-double ts_mult ( double *u, double h, int n )
+double ts_mult(double *u, double h, int n)
 
 /******************************************************************************/
 /*
@@ -470,7 +449,7 @@ double ts_mult ( double *u, double h, int n )
 
   Licensing:
 
-    This code is distributed under the GNU LGPL license. 
+    This code is distributed under the GNU LGPL license.
 
   Modified:
 
@@ -496,11 +475,10 @@ double ts_mult ( double *u, double h, int n )
   double hk;
   int k;
   double ts;
-  
+
   ts = 0.0;
   hk = 1.0;
-  for ( k = 1; k<= n; k++ )
-  {
+  for (k = 1; k <= n; k++) {
     ts = ts + u[k] * hk;
     hk = hk * h;
   }
@@ -508,5 +486,5 @@ double ts_mult ( double *u, double h, int n )
 }
 /******************************************************************************/
 
-  } // namespace
-} // namespace
+} // namespace quadrature
+} // namespace finufft
diff --git a/contrib/legendre_rule_fast.h b/contrib/legendre_rule_fast.h
index 49c5bcf13..357909f9e 100644
--- a/contrib/legendre_rule_fast.h
+++ b/contrib/legendre_rule_fast.h
@@ -2,9 +2,9 @@
 #define GAUSSQUAD_H
 
 namespace finufft {
-  namespace quadrature {
-  void legendre_compute_glr ( int n, double x[], double w[] );
-  }  // namespace
-}  // namespace
+namespace quadrature {
+void legendre_compute_glr(int n, double x[], double w[]);
+} // namespace quadrature
+} // namespace finufft
 
 #endif
diff --git a/devel/eval_ker_expts.cpp b/devel/eval_ker_expts.cpp
index 015bb8a38..8da4a1699 100644
--- a/devel/eval_ker_expts.cpp
+++ b/devel/eval_ker_expts.cpp
@@ -3,22 +3,25 @@
 
    compile with:
 
-g++ eval_ker_expts.cpp -o eval_ker_expts -Ofast -funroll-loops -march=native; time ./eval_ker_expts
+g++ eval_ker_expts.cpp -o eval_ker_expts -Ofast -funroll-loops -march=native; time
+./eval_ker_expts
 
    Barnett 3/28/18 for JD Patel (Intel).
    Single-prec version also of interest, if faster.
 */
 
-#include <vector>
-#include <stdlib.h>
-#include <stdio.h>
 #include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <vector>
 
 // Choose prec...
 typedef double FLT;
-//typedef float FLT;
+// typedef float FLT;
 
-static inline void evaluate_kernel_vector(FLT * __restrict__ ker, const FLT * __restrict__ args, const FLT beta, const FLT c, const int N)
+static inline void evaluate_kernel_vector(FLT *__restrict__ ker,
+                                          const FLT *__restrict__ args, const FLT beta,
+                                          const FLT c, const int N)
 /* Evaluate kernel for a vector of N arguments.
    Can comment out either or both loops.
    The #pragra's need to be removed for icpc if -fopenmp not used.
@@ -26,33 +29,31 @@ static inline void evaluate_kernel_vector(FLT * __restrict__ ker, const FLT * __
 {
 #pragma omp simd
   for (int i = 0; i < N; i++) // Loop 1: Compute exponential arguments
-    ker[i] = beta * sqrt(1.0 - c*args[i]*args[i]);
-  //ker[i] = beta * (1.0 - c*args[i]*args[i]);   // no-sqrt version
-  
+    ker[i] = beta * sqrt(1.0 - c * args[i] * args[i]);
+    // ker[i] = beta * (1.0 - c*args[i]*args[i]);   // no-sqrt version
+
 #pragma omp simd
   for (int i = 0; i < N; i++) // Loop 2: Compute exponentials
     ker[i] = exp(ker[i]);
 }
 
-int main(int argc, char* argv[])
-{
-  int M = (int) 1e7;                // # of reps
-  int w=10;                         // spread width (small), needn't be mult of 4
-  FLT beta=2.3*w, c = 4.0/(w*w); // ker params
-  FLT iw = 1.0/(FLT)w;
-  FLT ans = 0.0;                 // dummy answer
+int main(int argc, char *argv[]) {
+  int M    = (int)1e7;                   // # of reps
+  int w    = 10;                         // spread width (small), needn't be mult of 4
+  FLT beta = 2.3 * w, c = 4.0 / (w * w); // ker params
+  FLT iw  = 1.0 / (FLT)w;
+  FLT ans = 0.0;                         // dummy answer
   std::vector<FLT> x(w);
   std::vector<FLT> f(w);
-  for (int i=1;i<M;++i) {
-    FLT xi = i/(FLT)M;        // dummy offset to make each rep different
-    for (int j=0;j<w;++j)           // fill a simple argument vector (cheap)
-      x[j] = -1.0 + xi + iw*j;      // note each x in [-1,1]
-    evaluate_kernel_vector(&f[0],&x[0],beta,c,w);   // eval kernel into f
-    for (int j=0;j<w;++j)
-      ans += f[j];                  // do something cheap to use f output
+  for (int i = 1; i < M; ++i) {
+    FLT xi = i / (FLT)M;         // dummy offset to make each rep different
+    for (int j = 0; j < w; ++j)  // fill a simple argument vector (cheap)
+      x[j] = -1.0 + xi + iw * j; // note each x in [-1,1]
+    evaluate_kernel_vector(&f[0], &x[0], beta, c, w); // eval kernel into f
+    for (int j = 0; j < w; ++j) ans += f[j]; // do something cheap to use f output
     // we don't do anything with f, but compiler hasn't figured this out :)
   }
-  printf("ans=%.15g\n",ans);
+  printf("ans=%.15g\n", ans);
   return 0;
 }
 
diff --git a/devel/eval_ker_expts2.cpp b/devel/eval_ker_expts2.cpp
index bd415e8fd..8df8ed76b 100644
--- a/devel/eval_ker_expts2.cpp
+++ b/devel/eval_ker_expts2.cpp
@@ -1,60 +1,58 @@
 /* exponential sqrt kernel eval speed tester, single-thread, trying openmp simd.
    compile with:
 
-g++-7 eval_ker_expts2.cpp -o eval_ker_expts2 -Ofast -march=native -fopt-info; time ./eval_ker_expts2 10000000
+g++-7 eval_ker_expts2.cpp -o eval_ker_expts2 -Ofast -march=native -fopt-info; time
+./eval_ker_expts2 10000000
 
 Barnett 4/23/18. See below for concls.
 */
 
-#include <vector>
-#include <stdlib.h>
-#include <stdio.h>
 #include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <vector>
 
 // Choose prec...
 typedef double FLT;
-//typedef float FLT;
+// typedef float FLT;
 
-static inline void evaluate_kernel_vector(FLT* ker, const FLT* args, const FLT beta, const FLT c, const int N)
+static inline void evaluate_kernel_vector(FLT *ker, const FLT *args, const FLT beta,
+                                          const FLT c, const int N)
 /* Evaluate kernel for a vector of N arguments.
    The #pragmas need to be removed for icpc if -fopenmp not used.
 For g++-7, this pragma (with -fopenmp) slows it down from 0.2s to 0.4s!
 THe __restrict__ on the I/O args don't matter.
  */
 {
-  //#pragma omp simd
-  for (int i = 0; i < N; i++)
-    ker[i] = exp(beta * sqrt(FLT(1.0) - c*args[i]*args[i]));
+  // #pragma omp simd
+  for (int i = 0; i < N; i++) ker[i] = exp(beta * sqrt(FLT(1.0) - c * args[i] * args[i]));
   // FLT(1.0) suggested by mreineck
 
- // slows down from 0.2s to 2.0s for w=12, unless it's at 0.4s when no effect...
-  //  for (int i = 0; i < N; i++)         
-	//   if (fabs(args[i]) >= (FLT)N/2)    // note fabs not abs!
-	       // ker[i] = 0.0;
+  // slows down from 0.2s to 2.0s for w=12, unless it's at 0.4s when no effect...
+  //  for (int i = 0; i < N; i++)
+  //   if (fabs(args[i]) >= (FLT)N/2)    // note fabs not abs!
+  // ker[i] = 0.0;
 }
 
-int main(int argc, char* argv[])
-{
-  int M = (int) 1e7;          // # of reps
-  if (argc>1)
-    sscanf(argv[1],"%d",&M);  // find not needed to get the 0.2 s time.
-  int w=11;        // spread width: 10 0.17s, 11 1.8s, 12 0.2s, 13 2.0s, 15 2.5s
-  //if (argc>2)                 // even including this code slows to 0.4s !!
-  //sscanf(argv[2],"%d",&w);       //  .. but speeds up w=13 from 2s to 0.4s !
-  FLT beta=2.3*w, c = 4.0/(w*w); // typ ker params
-  FLT iw = 1.0/(FLT)w;
-  FLT ans = 0.0;                 // dummy answer
+int main(int argc, char *argv[]) {
+  int M = (int)1e7;                        // # of reps
+  if (argc > 1) sscanf(argv[1], "%d", &M); // find not needed to get the 0.2 s time.
+  int w = 11; // spread width: 10 0.17s, 11 1.8s, 12 0.2s, 13 2.0s, 15 2.5s
+  // if (argc>2)                 // even including this code slows to 0.4s !!
+  // sscanf(argv[2],"%d",&w);       //  .. but speeds up w=13 from 2s to 0.4s !
+  FLT beta = 2.3 * w, c = 4.0 / (w * w); // typ ker params
+  FLT iw  = 1.0 / (FLT)w;
+  FLT ans = 0.0;                         // dummy answer
   std::vector<FLT> x(w);
   std::vector<FLT> f(w);
-  for (int i=1;i<=M;++i) { // i=0 to M-1 : 2.1s;  i=1 to M : 0.2s !!!!!
-    FLT xi = -w/(FLT)2.0 + i/(FLT)M;  // dummy offset to make each rep different
-    for (int j=0;j<w;++j)           // fill a simple argument vector (cheap)
-      x[j] = xi + (FLT)j;      // note each x in [-w/2,w/2]
-    evaluate_kernel_vector(&f[0],&x[0],beta,c,w);   // eval kernel into f
-    for (int j=0;j<w;++j)
-      ans += f[j];                  // do something cheap to use f output
+  for (int i = 1; i <= M; ++i) {         // i=0 to M-1 : 2.1s;  i=1 to M : 0.2s !!!!!
+    FLT xi = -w / (FLT)2.0 + i / (FLT)M; // dummy offset to make each rep different
+    for (int j = 0; j < w; ++j)          // fill a simple argument vector (cheap)
+      x[j] = xi + (FLT)j;                // note each x in [-w/2,w/2]
+    evaluate_kernel_vector(&f[0], &x[0], beta, c, w); // eval kernel into f
+    for (int j = 0; j < w; ++j) ans += f[j]; // do something cheap to use f output
   }
-  printf("ans=%.15g\n",ans);
+  printf("ans=%.15g\n", ans);
   return 0;
 }
 
diff --git a/devel/eval_ker_expts_libin_simd64.cpp b/devel/eval_ker_expts_libin_simd64.cpp
index 7f4cc3b8f..5bc78c194 100644
--- a/devel/eval_ker_expts_libin_simd64.cpp
+++ b/devel/eval_ker_expts_libin_simd64.cpp
@@ -5,7 +5,8 @@ Libin Lu modified. - but need ICC to see fastest version.
 
    compile with:
 
-g++-7 eval_ker_expts_libin_simd64.cpp -o eval_ker_expts_libin_simd64 -Ofast -funroll-loops -march=native -fopt-info -fopt-info-vec-missed; time ./eval_ker_expts_libin_simd64
+g++-7 eval_ker_expts_libin_simd64.cpp -o eval_ker_expts_libin_simd64 -Ofast -funroll-loops
+-march=native -fopt-info -fopt-info-vec-missed; time ./eval_ker_expts_libin_simd64
 
 Ludvig's tweak of eval_ker_expts, 3/29/18.  Can get <0.2s for M=1e7, w=12.
 Note that the range of arguments is wrong [-1,1] not [-w/2,w/2].
@@ -18,11 +19,10 @@ that correlates w/ 0.2s magic.
 
 */
 
-#include <vector>
-#include <stdlib.h>
-#include <stdio.h>
 #include <math.h>
-
+#include <stdio.h>
+#include <stdlib.h>
+#include <vector>
 
 #ifdef VCL
 // Use Agner Fog's vector class library
@@ -33,65 +33,66 @@ that correlates w/ 0.2s magic.
 
 // Choose prec...
 typedef double FLT;
-//typedef float FLT;
+// typedef float FLT;
 
-static inline void evaluate_kernel_vector(FLT * __restrict__ ker, const FLT * __restrict__ args, const FLT beta, const FLT c, const int N)
+static inline void evaluate_kernel_vector(FLT *__restrict__ ker,
+                                          const FLT *__restrict__ args, const FLT beta,
+                                          const FLT c, const int N)
 /* Evaluate kernel for a vector of N arguments.
-*/
+ */
 {
-#ifdef VCL 
-  for (int i = 0; i < N; i+=4) // Assume w divisible by 4
+#ifdef VCL
+  for (int i = 0; i < N; i += 4) // Assume w divisible by 4
   {
     Vec4d vec;
     vec.load(args + i);
-    vec = exp(beta*sqrt(1.0 - c*vec*vec));
+    vec = exp(beta * sqrt(1.0 - c * vec * vec));
     vec.store(ker + i);
-  }  
+  }
 #else
   for (int i = 0; i < N; i++) // Straight computation, note no pragma omp simd
-    ker[i] = exp(beta * sqrt(1.0 - c*args[i]*args[i]));
+    ker[i] = exp(beta * sqrt(1.0 - c * args[i] * args[i]));
 #endif
-  
 }
 
-int main(int argc, char* argv[])
-{
-  int M = (int) 1e7;                // # of reps
-  int w=12;                         // 12, spread width (small), needn't be mult of 4, 15 takes 3.2s but 12 only 0.2s, in g++-7. But not in gcc 5.4.0
+int main(int argc, char *argv[]) {
+  int M = (int)1e7; // # of reps
+  int w = 12; // 12, spread width (small), needn't be mult of 4, 15 takes 3.2s but 12 only
+              // 0.2s, in g++-7. But not in gcc 5.4.0
 
-  if (1) {   // 0 makes 10x slower (2s) than 1, which is 0.2 s, for g++-7 - ahb
-  if (argc == 3)
-  {
-    sscanf(argv[1],"%d",&M);
-    //sscanf(argv[2],"%d",&w);  // slows down from 0.2s to 0.44s if use - why??
-  }
+  if (1) {    // 0 makes 10x slower (2s) than 1, which is 0.2 s, for g++-7 - ahb
+    if (argc == 3) {
+      sscanf(argv[1], "%d", &M);
+      // sscanf(argv[2],"%d",&w);  // slows down from 0.2s to 0.44s if use - why??
+    }
   }
-  
-  
-  FLT beta=2.3*w, c = 4.0/(w*w); // ker params
-  FLT iw = 1.0/(FLT)w;
-  FLT ans = 0.0;                 // dummy answer
+
+  FLT beta = 2.3 * w, c = 4.0 / (w * w); // ker params
+  FLT iw  = 1.0 / (FLT)w;
+  FLT ans = 0.0;                         // dummy answer
   std::vector<FLT> x(w);
   std::vector<FLT> f(w);
   FLT xi;
   FLT tmp_val;
-  
+
 #pragma omp simd simdlen(64)
   // this pragma makes no difference on modern gcc.
-  for (int i=1;i<=M;++i) {  // changing from i=1 to i=0 slows from 0.2s to 2.4s!!!! I don't understand - has to be a better way to control (assembly code?)
-    xi = i/(FLT)M;        // dummy offset to make each rep different
+  for (int i = 1; i <= M; ++i) { // changing from i=1 to i=0 slows from 0.2s to 2.4s!!!! I
+                                 // don't understand - has to be a better way to control
+                                 // (assembly code?)
+    xi = i / (FLT)M;             // dummy offset to make each rep different
     /*
     for (int j=0;j<w;++j)           // fill a simple argument vector (cheap)
       x[j] = -1.0 + xi + iw*j;      // note each x in [-1,1]
     evaluate_kernel_vector(&f[0],&x[0],beta,c,w);   // eval kernel into f
     */
-    for (int j=0;j<w;++j){
-      tmp_val = -1.0 + xi + iw*j;
-      ans += exp(beta * sqrt(1.0 - c*tmp_val*tmp_val));
-      //ans += f[j];                  // do something cheap to use f output
+    for (int j = 0; j < w; ++j) {
+      tmp_val = -1.0 + xi + iw * j;
+      ans += exp(beta * sqrt(1.0 - c * tmp_val * tmp_val));
+      // ans += f[j];                  // do something cheap to use f output
     }
     // we don't do anything with f, but compiler hasn't figured this out :)
   }
-  printf("ans=%.15g\n",ans);
+  printf("ans=%.15g\n", ans);
   return 0;
 }
diff --git a/devel/eval_ker_expts_ludvig.cpp b/devel/eval_ker_expts_ludvig.cpp
index 2094afe5f..55d166a70 100644
--- a/devel/eval_ker_expts_ludvig.cpp
+++ b/devel/eval_ker_expts_ludvig.cpp
@@ -3,7 +3,8 @@
 
    compile with:
 
-g++-7 eval_ker_expts_ludvig.cpp -o eval_ker_expts_ludvig -Ofast -funroll-loops -march=native -fopt-info; time ./eval_ker_expts_ludvig
+g++-7 eval_ker_expts_ludvig.cpp -o eval_ker_expts_ludvig -Ofast -funroll-loops
+-march=native -fopt-info; time ./eval_ker_expts_ludvig
 
 Update: (8/8/19)
 g++-8 is less brittle - it is able to get 0.2 s runtime for i=1 or 0 start.
@@ -18,11 +19,10 @@ eval_ker_expts_ludvig.cpp:69:17: note: loop vectorized
 that correlates w/ 0.2s magic.
 */
 
-#include <vector>
-#include <stdlib.h>
-#include <stdio.h>
 #include <math.h>
-
+#include <stdio.h>
+#include <stdlib.h>
+#include <vector>
 
 #ifdef VCL
 // Use Agner Fog's vector class library
@@ -33,56 +33,56 @@ that correlates w/ 0.2s magic.
 
 // Choose prec...
 typedef double FLT;
-//typedef float FLT;
+// typedef float FLT;
 
-static inline void evaluate_kernel_vector(FLT * __restrict__ ker, const FLT * __restrict__ args, const FLT beta, const FLT c, const int N)
+static inline void evaluate_kernel_vector(FLT *__restrict__ ker,
+                                          const FLT *__restrict__ args, const FLT beta,
+                                          const FLT c, const int N)
 /* Evaluate kernel for a vector of N arguments.
-*/
+ */
 {
-#ifdef VCL 
-  for (int i = 0; i < N; i+=4) // Assume w divisible by 4
+#ifdef VCL
+  for (int i = 0; i < N; i += 4) // Assume w divisible by 4
   {
     Vec4d vec;
     vec.load(args + i);
-    vec = exp(beta*sqrt(1.0 - c*vec*vec));
+    vec = exp(beta * sqrt(1.0 - c * vec * vec));
     vec.store(ker + i);
-  }  
+  }
 #else
   for (int i = 0; i < N; i++) // Straight computation, note no pragma omp simd
-    ker[i] = exp(beta * sqrt(1.0 - c*args[i]*args[i]));
+    ker[i] = exp(beta * sqrt(1.0 - c * args[i] * args[i]));
 #endif
-  
 }
 
-int main(int argc, char* argv[])
-{
-  int M = (int) 1e7;                // # of reps
-  int w=12;                         // 12, spread width (small), needn't be mult of 4, 15 takes 3.2s but 12 only 0.2s, in g++-7. But not in gcc 5.4.0
+int main(int argc, char *argv[]) {
+  int M = (int)1e7; // # of reps
+  int w = 12; // 12, spread width (small), needn't be mult of 4, 15 takes 3.2s but 12 only
+              // 0.2s, in g++-7. But not in gcc 5.4.0
 
-  if (1) {   // 0 makes 10x slower (2s) than 1, which is 0.2 s, for g++-7 - ahb
-  if (argc == 3)
-  {
-    sscanf(argv[1],"%d",&M);
-    //sscanf(argv[2],"%d",&w);  // slows down from 0.2s to 0.44s if use - why??
+  if (1) {    // 0 makes 10x slower (2s) than 1, which is 0.2 s, for g++-7 - ahb
+    if (argc == 3) {
+      sscanf(argv[1], "%d", &M);
+      // sscanf(argv[2],"%d",&w);  // slows down from 0.2s to 0.44s if use - why??
+    }
   }
-  }
-  
-  
-  FLT beta=2.3*w, c = 4.0/(w*w); // ker params
-  FLT iw = 1.0/(FLT)w;
-  FLT ans = 0.0;                 // dummy answer
+
+  FLT beta = 2.3 * w, c = 4.0 / (w * w); // ker params
+  FLT iw  = 1.0 / (FLT)w;
+  FLT ans = 0.0;                         // dummy answer
   std::vector<FLT> x(w);
   std::vector<FLT> f(w);
-  
-  for (int i=1;i<=M;++i) {  // changing from i=1 to i=0 slows from 0.2s to 2.4s!!!! I don't understand - has to be a better way to control (assembly code?)
-    FLT xi = i/(FLT)M;        // dummy offset to make each rep different
-    for (int j=0;j<w;++j)           // fill a simple argument vector (cheap)
-      x[j] = -1.0 + xi + iw*j;      // note each x in [-1,1]
-    evaluate_kernel_vector(&f[0],&x[0],beta,c,w);   // eval kernel into f
-    for (int j=0;j<w;++j)
-      ans += f[j];                  // do something cheap to use f output
+
+  for (int i = 1; i <= M; ++i) { // changing from i=1 to i=0 slows from 0.2s to 2.4s!!!! I
+                                 // don't understand - has to be a better way to control
+                                 // (assembly code?)
+    FLT xi = i / (FLT)M;         // dummy offset to make each rep different
+    for (int j = 0; j < w; ++j)  // fill a simple argument vector (cheap)
+      x[j] = -1.0 + xi + iw * j; // note each x in [-1,1]
+    evaluate_kernel_vector(&f[0], &x[0], beta, c, w); // eval kernel into f
+    for (int j = 0; j < w; ++j) ans += f[j]; // do something cheap to use f output
     // we don't do anything with f, but compiler hasn't figured this out :)
   }
-  printf("ans=%.15g\n",ans);
+  printf("ans=%.15g\n", ans);
   return 0;
 }
diff --git a/devel/foldrescale.cpp b/devel/foldrescale.cpp
index 1bcaef16d..63f994979 100644
--- a/devel/foldrescale.cpp
+++ b/devel/foldrescale.cpp
@@ -1,12 +1,12 @@
 #include "finufft/defs.h"
 #include <benchmark/benchmark.h>
+#include <cmath>
 #include <iostream>
 #include <random>
-#include <cmath>
 #include <xsimd/xsimd.hpp>
 
 // no vectorize
-//#pragma GCC optimize("no-tree-vectorize")
+// #pragma GCC optimize("no-tree-vectorize")
 /* local NU coord fold+rescale macro: does the following affine transform to x:
      when p=true:   map [-3pi,-pi) and [-pi,pi) and [pi,3pi)    each to [0,N)
      otherwise,     map [-N,0) and [0,N) and [N,2N)             each to [0,N)
@@ -17,63 +17,58 @@
    The macro wins hands-down on i7, even for modern GCC9.
    This should be done in C++ not as a macro, someday.
 */
-#define FOLDRESCALE(x, N, p) (p ?                                         \
-         (x + (x>=-PI ? (x<PI ? PI : -PI) : 3*PI)) * ((FLT)M_1_2PI*N) : \
-                        (x>=0.0 ? (x<(FLT)N ? x : x-(FLT)N) : x+(FLT)N))
-
+#define FOLDRESCALE(x, N, p)                                                \
+  (p ? (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT)M_1_2PI * N) \
+     : (x >= 0.0 ? (x < (FLT)N ? x : x - (FLT)N) : x + (FLT)N))
 
-#define FOLDRESCALE04(x, N, p) (p ? \
-   ((x * FLT(M_1_2PI) + FLT(0.5)) - floor(x * FLT(M_1_2PI) + FLT(0.5)))  * FLT(N) : \
-    ((x/FLT(N))-floor(x/FLT(N)))*FLT(N))
+#define FOLDRESCALE04(x, N, p)                                                       \
+  (p ? ((x * FLT(M_1_2PI) + FLT(0.5)) - floor(x * FLT(M_1_2PI) + FLT(0.5))) * FLT(N) \
+     : ((x / FLT(N)) - floor(x / FLT(N))) * FLT(N))
 
-#define FOLDRESCALE05(x, N, p) FLT(N) * (p ? \
-   ((x * FLT(M_1_2PI) + FLT(0.5)) - floor(x * FLT(M_1_2PI) + FLT(0.5))) : \
-    ((x/FLT(N))-floor(x/FLT(N))))
+#define FOLDRESCALE05(x, N, p)                                                       \
+  FLT(N) * (p ? ((x * FLT(M_1_2PI) + FLT(0.5)) - floor(x * FLT(M_1_2PI) + FLT(0.5))) \
+              : ((x / FLT(N)) - floor(x / FLT(N))))
 
-inline __attribute__((always_inline))
-FLT foldRescale00(FLT x, BIGINT N, bool p) {
+inline __attribute__((always_inline)) FLT foldRescale00(FLT x, BIGINT N, bool p) {
   FLT result;
   FLT fN = FLT(N);
   if (p) {
     static constexpr FLT x2pi = FLT(M_1_2PI);
-    result = x * x2pi + FLT(0.5);
+    result                    = x * x2pi + FLT(0.5);
     result -= floor(result);
   } else {
     const FLT invN = FLT(1.0) / fN;
-    result = x * invN;
+    result         = x * invN;
     result -= floor(result);
   }
   return result * fN;
 }
 
-inline __attribute__((always_inline))
-FLT foldRescale01(FLT x, BIGINT N, bool p) {
-  return p ? (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT) M_1_2PI * N) :
-         (x >= 0.0 ? (x < (FLT) N ? x : x - (FLT) N) : x + (FLT) N);
+inline __attribute__((always_inline)) FLT foldRescale01(FLT x, BIGINT N, bool p) {
+  return p ? (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT)M_1_2PI * N)
+           : (x >= 0.0 ? (x < (FLT)N ? x : x - (FLT)N) : x + (FLT)N);
 }
 
 template<bool p>
-inline __attribute__((always_inline))
-FLT foldRescale02(FLT x, BIGINT N) {
+inline __attribute__((always_inline)) FLT foldRescale02(FLT x, BIGINT N) {
   if constexpr (p) {
-    return (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT) M_1_2PI * N);
+    return (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT)M_1_2PI * N);
   } else {
-    return (x >= 0.0 ? (x < (FLT) N ? x : x - (FLT) N) : x + (FLT) N);
+    return (x >= 0.0 ? (x < (FLT)N ? x : x - (FLT)N) : x + (FLT)N);
   }
 }
 
 template<bool p>
-inline __attribute__((always_inline))
-FLT foldRescale03(FLT x, BIGINT N) {
+inline __attribute__((always_inline)) FLT foldRescale03(FLT x, BIGINT N) {
   FLT result;
   FLT fN = FLT(N);
   if constexpr (p) {
     static constexpr FLT x2pi = FLT(M_1_2PI);
-    result = std::fma(x, x2pi, FLT(0.5));
+    result                    = std::fma(x, x2pi, FLT(0.5));
     result -= floor(result);
   } else {
     const FLT invN = FLT(1.0) / fN;
-    result = x * invN;
+    result         = x * invN;
     result -= floor(result);
   }
   return result * fN;
@@ -81,10 +76,10 @@ FLT foldRescale03(FLT x, BIGINT N) {
 
 xsimd::batch<FLT> fold_rescale_vec(xsimd::batch<FLT> x, BIGINT N) {
   xsimd::batch<FLT> result;
-  const xsimd::batch<FLT> fN = xsimd::batch<FLT>(FLT(N));
+  const xsimd::batch<FLT> fN          = xsimd::batch<FLT>(FLT(N));
   static const xsimd::batch<FLT> x2pi = xsimd::batch<FLT>(FLT(M_1_2PI));
   static const xsimd::batch<FLT> half = xsimd::batch<FLT>(FLT(0.5));
-  result = xsimd::fma(x, x2pi, half);
+  result                              = xsimd::fma(x, x2pi, half);
   result -= xsimd::floor(result);
   return result * fN;
 }
@@ -92,122 +87,116 @@ xsimd::batch<FLT> fold_rescale_vec(xsimd::batch<FLT> x, BIGINT N) {
 static std::mt19937_64 gen;
 static std::uniform_real_distribution<> dis(-10, 10);
 static const auto N = std::uniform_int_distribution<>{0, 1000}(gen);
-static std::uniform_real_distribution<> disN(-N, 2*N);
-static volatile auto pirange = true;
+static std::uniform_real_distribution<> disN(-N, 2 * N);
+static volatile auto pirange    = true;
 static volatile auto notPirange = !pirange;
 
 static void BM_BASELINE(benchmark::State &state) {
-  for (auto _: state) {
+  for (auto _ : state) {
     benchmark::DoNotOptimize(dis(gen));
   }
 }
 
 static void BM_FoldRescaleMacro(benchmark::State &state) {
-  for (auto _: state) {
+  for (auto _ : state) {
     FLT x = dis(gen);
     benchmark::DoNotOptimize(FOLDRESCALE(x, N, pirange));
   }
 }
 
 static void BM_FoldRescaleMacroN(benchmark::State &state) {
-  for (auto _: state) {
+  for (auto _ : state) {
     FLT x = disN(gen);
     benchmark::DoNotOptimize(FOLDRESCALE(x, N, notPirange));
   }
 }
 
 static void BM_FoldRescale00(benchmark::State &state) {
-  for (auto _: state) {
+  for (auto _ : state) {
     FLT x = dis(gen);
     benchmark::DoNotOptimize(foldRescale00(x, N, pirange));
   }
 }
 
-
 static void BM_FoldRescale00N(benchmark::State &state) {
-  for (auto _: state) {
+  for (auto _ : state) {
     FLT x = disN(gen);
     benchmark::DoNotOptimize(foldRescale00(x, N, notPirange));
   }
 }
 
-
 static void BM_FoldRescale01(benchmark::State &state) {
-  for (auto _: state) {
+  for (auto _ : state) {
     FLT x = dis(gen);
     benchmark::DoNotOptimize(foldRescale01(x, N, pirange));
   }
 }
 
-
 static void BM_FoldRescale01N(benchmark::State &state) {
-  for (auto _: state) {
+  for (auto _ : state) {
     FLT x = disN(gen);
     benchmark::DoNotOptimize(foldRescale01(x, N, notPirange));
   }
 }
 
 static void BM_FoldRescale02(benchmark::State &state) {
-  for (auto _: state) {
+  for (auto _ : state) {
     FLT x = dis(gen);
     benchmark::DoNotOptimize(foldRescale02<true>(x, N));
   }
 }
 
-
 static void BM_FoldRescale02N(benchmark::State &state) {
-  for (auto _: state) {
+  for (auto _ : state) {
     FLT x = disN(gen);
     benchmark::DoNotOptimize(foldRescale02<false>(x, N));
   }
 }
 
-
 static void BM_FoldRescale03(benchmark::State &state) {
-  for (auto _: state) {
+  for (auto _ : state) {
     FLT x = dis(gen);
     benchmark::DoNotOptimize(foldRescale03<true>(x, N));
   }
 }
 
 static void BM_FoldRescale03N(benchmark::State &state) {
-  for (auto _: state) {
+  for (auto _ : state) {
     FLT x = disN(gen);
     benchmark::DoNotOptimize(foldRescale03<false>(x, N));
   }
 }
 
 static void BM_FoldRescale04(benchmark::State &state) {
-  for (auto _: state) {
+  for (auto _ : state) {
     FLT x = dis(gen);
     benchmark::DoNotOptimize(FOLDRESCALE04(x, N, pirange));
   }
 }
 
 static void BM_FoldRescale04N(benchmark::State &state) {
-  for (auto _: state) {
+  for (auto _ : state) {
     FLT x = disN(gen);
     benchmark::DoNotOptimize(FOLDRESCALE04(x, N, notPirange));
   }
 }
 
 static void BM_FoldRescale05(benchmark::State &state) {
-  for (auto _: state) {
+  for (auto _ : state) {
     FLT x = dis(gen);
     benchmark::DoNotOptimize(FOLDRESCALE05(x, N, pirange));
   }
 }
 
 static void BM_FoldRescale05N(benchmark::State &state) {
-  for (auto _: state) {
+  for (auto _ : state) {
     FLT x = disN(gen);
     benchmark::DoNotOptimize(FOLDRESCALE05(x, N, notPirange));
   }
 }
 
-
 static void BM_FoldRescaleVec(benchmark::State &state) {
-  for (auto _: state) {
+  for (auto _ : state) {
     // Generate 4 floating point numbers
     constexpr auto size = xsimd::batch<FLT>::size;
     std::array<FLT, size> arr;
@@ -220,7 +209,6 @@ static void BM_FoldRescaleVec(benchmark::State &state) {
   }
 }
 
-
 BENCHMARK(BM_BASELINE)->Iterations(10000000);
 BENCHMARK(BM_FoldRescaleMacro)->Iterations(1000000);
 BENCHMARK(BM_FoldRescale00)->Iterations(1000000);
@@ -229,7 +217,7 @@ BENCHMARK(BM_FoldRescale02)->Iterations(1000000);
 BENCHMARK(BM_FoldRescale03)->Iterations(10000000);
 BENCHMARK(BM_FoldRescale04)->Iterations(1000000);
 BENCHMARK(BM_FoldRescale05)->Iterations(1000000);
-BENCHMARK(BM_FoldRescaleVec)->Iterations(1000000/4);
+BENCHMARK(BM_FoldRescaleVec)->Iterations(1000000 / 4);
 BENCHMARK(BM_FoldRescaleMacroN)->Iterations(1000000);
 BENCHMARK(BM_FoldRescale00N)->Iterations(1000000);
 BENCHMARK(BM_FoldRescale01N)->Iterations(1000000);
@@ -238,14 +226,13 @@ BENCHMARK(BM_FoldRescale03N)->Iterations(1000000);
 BENCHMARK(BM_FoldRescale04N)->Iterations(1000000);
 BENCHMARK(BM_FoldRescale05N)->Iterations(1000000);
 
-
 void testFoldRescaleVec() {
   constexpr auto size = xsimd::batch<FLT>::size;
   std::array<FLT, size> xVec;
   for (int i = 0; i < size; ++i) {
     xVec[i] = dis(gen);
   }
-  const auto x = xsimd::load(xVec.data());
+  const auto x      = xsimd::load(xVec.data());
   const auto result = fold_rescale_vec(x, N);
   std::array<FLT, size> resultVec;
   xsimd::store(resultVec.data(), result);
@@ -255,51 +242,59 @@ void testFoldRescaleVec() {
   for (int i = 0; i < size; ++i) {
     double result00 = foldRescale03<true>(xVec[i], N);
     if (std::abs(1 - result00 / resultVec[i]) > 1e-14) {
-      std::cout << "input: " << xVec[i] << " result00: " << result00 << " resultVec: " << resultVec[i] << std::endl;
+      std::cout << "input: " << xVec[i] << " result00: " << result00
+                << " resultVec: " << resultVec[i] << std::endl;
       throw std::runtime_error("foldRescaleVec is not equivalent to foldRescale00");
     }
   }
 }
 
 void testFoldRescaleFunctions() {
-  for (bool p: {false, true}) {
-    for (int i = 0; i < 1024; ++i) {  // Run the test 1000 times
-      FLT x = dis(gen);
+  for (bool p : {false, true}) {
+    for (int i = 0; i < 1024; ++i) { // Run the test 1000 times
+      FLT x           = dis(gen);
       FLT resultMacro = FOLDRESCALE(x, N, p);
-      FLT result00 = foldRescale00(x, N, p);
-      FLT result01 = foldRescale01(x, N, p);
-      FLT result02 = p ? foldRescale02<true>(x, N) : foldRescale02<false>(x, N);
-      FLT result03 = p ? foldRescale03<true>(x, N) : foldRescale03<false>(x, N);
-      FLT result04 = FOLDRESCALE04(x, N, p);
-      FLT result05 = FOLDRESCALE05(x, N, p);
-
-      // function that compares two floating point number with a tolerance, using relative error
+      FLT result00    = foldRescale00(x, N, p);
+      FLT result01    = foldRescale01(x, N, p);
+      FLT result02    = p ? foldRescale02<true>(x, N) : foldRescale02<false>(x, N);
+      FLT result03    = p ? foldRescale03<true>(x, N) : foldRescale03<false>(x, N);
+      FLT result04    = FOLDRESCALE04(x, N, p);
+      FLT result05    = FOLDRESCALE05(x, N, p);
+
+      // returns true when two floating point numbers differ by more than a relative
+      // tolerance
       auto compare = [](FLT a, FLT b) {
         return std::abs(a - b) > std::max(std::abs(a), std::abs(b)) * 10e-13;
       };
 
       if (compare(resultMacro, result00)) {
-        std::cout << "resultMacro: " << resultMacro << " result00: " << result00 << std::endl;
+        std::cout << "resultMacro: " << resultMacro << " result00: " << result00
+                  << std::endl;
         throw std::runtime_error("function00 is wrong");
       }
       if (compare(resultMacro, result01)) {
-        std::cout << "resultMacro: " << resultMacro << " result01: " << result01 << std::endl;
+        std::cout << "resultMacro: " << resultMacro << " result01: " << result01
+                  << std::endl;
         throw std::runtime_error("function01 is wrong");
       }
       if (compare(resultMacro, result02)) {
-        std::cout << "resultMacro: " << resultMacro << " result02: " << result02 << std::endl;
+        std::cout << "resultMacro: " << resultMacro << " result02: " << result02
+                  << std::endl;
         throw std::runtime_error("function02 is wrong");
       }
       if (compare(resultMacro, result03)) {
-        std::cout << "resultMacro: " << resultMacro << " result03: " << result03 << std::endl;
+        std::cout << "resultMacro: " << resultMacro << " result03: " << result03
+                  << std::endl;
         throw std::runtime_error("function03 is wrong");
       }
       if (compare(resultMacro, result04)) {
-        std::cout << "resultMacro: " << resultMacro << " result04: " << result04 << std::endl;
+        std::cout << "resultMacro: " << resultMacro << " result04: " << result04
+                  << std::endl;
         throw std::runtime_error("function04 is wrong");
       }
       if (compare(resultMacro, result05)) {
-        std::cout << "resultMacro: " << resultMacro << " result05: " << result05 << std::endl;
+        std::cout << "resultMacro: " << resultMacro << " result05: " << result05
+                  << std::endl;
         throw std::runtime_error("function05 is wrong");
       }
     }
@@ -313,7 +308,7 @@ class BaselineSubtractingReporter : public benchmark::ConsoleReporter {
   }
 
   void ReportRuns(const std::vector<Run> &reports) override {
-    for (const auto &run: reports) {
+    for (const auto &run : reports) {
       if (run.benchmark_name() == "BM_BASELINE") {
         baseline_time = run.cpu_accumulated_time;
       } else {
@@ -329,7 +324,7 @@ class BaselineSubtractingReporter : public benchmark::ConsoleReporter {
 };
 
 int main(int argc, char **argv) {
-  pirange = argc & 1;
+  pirange    = argc & 1;
   notPirange = !pirange;
   static std::random_device rd;
   const auto seed = rd();
diff --git a/devel/foldrescale_perf.cpp b/devel/foldrescale_perf.cpp
index 3d423cdba..a4ac38c99 100644
--- a/devel/foldrescale_perf.cpp
+++ b/devel/foldrescale_perf.cpp
@@ -4,7 +4,8 @@
 
    Compile with, eg on linux, double-prec:
 
-   g++ -O3 -funroll-loops -march=native -I../include foldrescale_perf.cpp -o foldrescale_perf
+   g++ -O3 -funroll-loops -march=native -I../include foldrescale_perf.cpp \
+       -o foldrescale_perf
 
    Use -DSINGLE for single-prec
 
@@ -35,9 +36,13 @@ using namespace std::chrono;
 #endif
 
 // old coord-handling macro ------------------------------------------------
-#define RESCALE(x,N,p) (p ?                                           \
-                        (x*(FLT)M_1_2PI*N + (x*(FLT)M_1_2PI*N<-N/(FLT)2.0 ? (FLT)1.5 : (x*(FLT)M_1_2PI*N>N/(FLT)2.0 ? (FLT)-0.5 : (FLT)0.5))*N) : \
-		     (x<0 ? x+N : (x>N ? x-N : x)))
+#define RESCALE(x, N, p)                                                         \
+  (p ? (x * (FLT)M_1_2PI * N +                                                   \
+        (x * (FLT)M_1_2PI * N < -N / (FLT)2.0                                    \
+             ? (FLT)1.5                                                          \
+             : (x * (FLT)M_1_2PI * N > N / (FLT)2.0 ? (FLT) - 0.5 : (FLT)0.5)) * \
+            N)                                                                   \
+     : (x < 0 ? x + N : (x > N ? x - N : x)))
 
 // function equivalent -----------------------------------------------------
 FLT foldrescale(FLT x, BIGINT N, int pirange)
@@ -48,58 +53,68 @@ FLT foldrescale(FLT x, BIGINT N, int pirange)
   // affine rescale...
   FLT z = x;
   if (pirange)
-    z = (N/(2*PI)) * (x+PI);                  // PI is (FLT)M_PI in defs.h
+    z = (N / (2 * PI)) * (x + PI); // PI is (FLT)M_PI in defs.h
   else
     z = x;
   // fold...
-  if (z<(FLT)0.0)
+  if (z < (FLT)0.0)
     z += (FLT)N;
-  else if (z>=(FLT)N)
+  else if (z >= (FLT)N)
     z -= (FLT)N;
   return z;
-} 
+}
 
 // ==========================================================================
-int main(int argc, char* argv[])
-{
-  int M=100000000;                // default: # pts to test
-  long int N = 1000000;           // default: grid size, doesn't matter
-  
-  if (argc>1) { double w; sscanf(argv[1],"%lf",&w); M = (int)w; }
-  if (argc>2) { double w; sscanf(argv[2],"%lf",&w); N = (long int)w; }
+int main(int argc, char *argv[]) {
+  int M      = 100000000; // default: # pts to test
+  long int N = 1000000;   // default: grid size, doesn't matter
 
-  FLT sum=0.0;
+  if (argc > 1) {
+    double w;
+    sscanf(argv[1], "%lf", &w);
+    M = (int)w;
+  }
+  if (argc > 2) {
+    double w;
+    sscanf(argv[2], "%lf", &w);
+    N = (long int)w;
+  }
+
+  FLT sum     = 0.0;
   auto tbegin = system_clock::now();
-  for (int i=0;i<M;++i) {                     // v predictable x values,
-    FLT x = (FLT)(-10.0) + i*((FLT)20.0/N);   // I hope cheap; let's see!
+  for (int i = 0; i < M; ++i) {                 // v predictable x values,
+    FLT x = (FLT)(-10.0) + i * ((FLT)20.0 / N); // I hope cheap; let's see!
     sum += x;
   }
-  duration<double> dur = system_clock::now() - tbegin;   // dur.count() is sec
-  printf("backgnd ops:              \t%.3g s/call\t\t(sum:%.12g)\n",dur.count()/M,sum);
+  duration<double> dur = system_clock::now() - tbegin; // dur.count() is sec
+  printf("backgnd ops:              \t%.3g s/call\t\t(sum:%.12g)\n", dur.count() / M,
+         sum);
 
   sum = 0.0;
-  for (int pirange=0;pirange<2;++pirange) {
+  for (int pirange = 0; pirange < 2; ++pirange) {
     tbegin = system_clock::now();
-    for (int i=0;i<M;++i) {
-      FLT x = (FLT)(-10.0) + i*((FLT)20.0/N);
-      FLT z = RESCALE(x,N,pirange);
+    for (int i = 0; i < M; ++i) {
+      FLT x = (FLT)(-10.0) + i * ((FLT)20.0 / N);
+      FLT z = RESCALE(x, N, pirange);
       sum += z;
     }
-    dur = system_clock::now() - tbegin;   // dur.count() is sec
-    printf("w/ RESCALE macro (pir=%d):\t%.3g s/call\t\t(sum:%.12g)\n",pirange,dur.count()/M,sum);
+    dur = system_clock::now() - tbegin; // dur.count() is sec
+    printf("w/ RESCALE macro (pir=%d):\t%.3g s/call\t\t(sum:%.12g)\n", pirange,
+           dur.count() / M, sum);
   }
-  
+
   sum = 0.0;
-  for (int pirange=0;pirange<2;++pirange) {
+  for (int pirange = 0; pirange < 2; ++pirange) {
     tbegin = system_clock::now();
-    for (int i=0;i<M;++i) {
-      FLT x = (FLT)(-10.0) + i*((FLT)20.0/N);
-      FLT z = foldrescale(x,N,pirange);
+    for (int i = 0; i < M; ++i) {
+      FLT x = (FLT)(-10.0) + i * ((FLT)20.0 / N);
+      FLT z = foldrescale(x, N, pirange);
       sum += z;
     }
-    dur = system_clock::now() - tbegin;   // dur.count() is sec
-    printf("w/ foldrescale (pir=%d):  \t%.3g s/call\t\t(sum:%.12g)\n",pirange,dur.count()/M,sum);
+    dur = system_clock::now() - tbegin; // dur.count() is sec
+    printf("w/ foldrescale (pir=%d):  \t%.3g s/call\t\t(sum:%.12g)\n", pirange,
+           dur.count() / M, sum);
   }
-  
+
   return 0;
 }
diff --git a/devel/foldrescale_perf2.cpp b/devel/foldrescale_perf2.cpp
index b57eb2746..7088d0e0b 100644
--- a/devel/foldrescale_perf2.cpp
+++ b/devel/foldrescale_perf2.cpp
@@ -4,8 +4,9 @@
 
    Compile with, eg on linux, double-prec:
 
-   g++ -O3 -funroll-loops -march=native -I../include -fopenmp foldrescale_perf2.cpp -o foldrescale_perf2 -lgomp
-   g++ -O3 -funroll-loops -march=native -I../include -fopenmp foldrescale_perf2.cpp -o foldrescale_perf2 -lgomp -Ofast -fno-finite-math-only
+   g++ -O3 -funroll-loops -march=native -I../include -fopenmp foldrescale_perf2.cpp -o foldrescale_perf2 -lgomp
+   g++ -O3 -funroll-loops -march=native -I../include -fopenmp foldrescale_perf2.cpp -o foldrescale_perf2 -lgomp \
+       -Ofast -fno-finite-math-only
 
    Flags: -DSINGLE for single-prec. OMP only used for random # gen.
           -DNOBIN to skip the binning, leaving just fold&rescale.
@@ -24,22 +25,21 @@ BETTER i7 GCC9 RESULTS:  (run ./foldrescale.sh)
 
 BINNING (closer to spreadinterp application):
 
-alex@fiona /home/alex/numerics/finufft/devel> g++-9 -O3 -funroll-loops -march=native -I../include -fopenmp foldrescale_perf2.cpp -o foldrescale_perf2 -lgomp
-alex@fiona /home/alex/numerics/finufft/devel> ./foldrescale_perf2
-simple array sum:           	1.9 ns/call	(sum:540.8833119415621)
-simple bin over [-3pi,3pi):  	1.1 ns/call	(ans:100667)
-w/ RESCALE1 macro:       	4.3 ns/call	(sum:499894508.4253364)
-w/ RESCALE macro (pir=0):	6.7 ns/call	(sum:499894508.4253364)
-w/ RESCALE macro (pir=1):	4.5 ns/call	(sum:499894508.4253364)
-w/ foldrescale1:           	8.3 ns/call	(sum:499894508.4253364)
-w/ foldrescale2:           	7.0 ns/call	(sum:499894508.4253364)
-w/ foldrescale3:           	7.0 ns/call	(sum:499894508.4253364)
-w/ foldrescale (pir=0):  	6.7 ns/call	(sum:499894508.4253364)
-w/ foldrescale (pir=1):  	8.2 ns/call	(sum:499894508.4253364)
-						(ans:905754)
-
-alex@fiona /home/alex/numerics/finufft/devel> g++-9 -O3 -funroll-loops -march=native -I../include -fopenmp foldrescale_perf2.cpp -o foldrescale_perf2 -lgomp -Ofast -fno-finite-math-only
-alex@fiona /home/alex/numerics/finufft/devel> ./foldrescale_perf2
+alex@fiona /home/alex/numerics/finufft/devel> g++-9 -O3 -funroll-loops -march=native -I../include -fopenmp foldrescale_perf2.cpp -o foldrescale_perf2 -lgomp
+alex@fiona /home/alex/numerics/finufft/devel> ./foldrescale_perf2
+simple array sum:           	1.9 ns/call	(sum:540.8833119415621)
+simple bin over [-3pi,3pi):  	1.1 ns/call	(ans:100667)
+w/ RESCALE1 macro:       	4.3 ns/call	(sum:499894508.4253364)
+w/ RESCALE macro (pir=0):	6.7 ns/call	(sum:499894508.4253364)
+w/ RESCALE macro (pir=1):	4.5 ns/call	(sum:499894508.4253364)
+w/ foldrescale1:           	8.3 ns/call	(sum:499894508.4253364)
+w/ foldrescale2:           	7.0 ns/call	(sum:499894508.4253364)
+w/ foldrescale3:           	7.0 ns/call	(sum:499894508.4253364)
+w/ foldrescale (pir=0):  	6.7 ns/call	(sum:499894508.4253364)
+w/ foldrescale (pir=1):  	8.2 ns/call	(sum:499894508.4253364)
+						(ans:905754)
+alex@fiona /home/alex/numerics/finufft/devel> g++-9 -O3 -funroll-loops -march=native -I../include -fopenmp foldrescale_perf2.cpp -o foldrescale_perf2 -lgomp -Ofast -fno-finite-math-only
+alex@fiona /home/alex/numerics/finufft/devel> ./foldrescale_perf2
 simple array sum:           	0.4 ns/call	(sum:-9554.451222028649)
 simple bin over [-3pi,3pi):  	1.5 ns/call	(ans:100815)
 w/ RESCALE1 macro:       	2.0 ns/call	(sum:499919136.1859143)
@@ -50,35 +50,31 @@ w/ foldrescale2:           	6.7 ns/call	(sum:499919136.1859144)
 w/ foldrescale3:           	7.0 ns/call	(sum:499919136.1859144)
 w/ foldrescale (pir=0):  	6.4 ns/call	(sum:499919136.1859144)
 w/ foldrescale (pir=1):  	8.1 ns/call	(sum:499919136.1859143)
-						(ans:904913)
+						(ans:904913)
 NOBIN:
 
-alex@fiona /home/alex/numerics/finufft/devel> g++-9 -O3 -funroll-loops -march=native -I../include -fopenmp foldrescale_perf2.cpp -o foldrescale_perf2 -lgomp -DNOBIN
-alex@fiona /home/alex/numerics/finufft/devel> ./foldrescale_perf2
-simple array sum:           	1.3 ns/call	(sum:-5028.023988434961)
-w/ RESCALE1 macro:       	1.3 ns/call	(sum:499984776.5128576)
-w/ RESCALE macro (pir=0):	6.4 ns/call	(sum:499984776.5128576)
-w/ RESCALE macro (pir=1):	1.4 ns/call	(sum:499984776.5128576)
-w/ foldrescale1:           	7.8 ns/call	(sum:499984776.5128576)
-w/ foldrescale2:           	6.2 ns/call	(sum:499984776.5128576)
-w/ foldrescale3:           	6.4 ns/call	(sum:499984776.5128576)
-w/ foldrescale (pir=0):  	6.3 ns/call	(sum:499984776.5128576)
-w/ foldrescale (pir=1):  	8.2 ns/call	(sum:499984776.5128576)
-						(ans:0)
-
-alex@fiona /home/alex/numerics/finufft/devel> g++-9 -O3 -funroll-loops -march=native -I../include -fopenmp foldrescale_perf2.cpp -o foldrescale_perf2 -lgomp -Ofast -fno-finite-math-only -DNOBIN
-alex@fiona /home/alex/numerics/finufft/devel> ./foldrescale_perf2
-simple array sum:           	0.4 ns/call	(sum:-14573.38274652959)
-w/ RESCALE1 macro:       	0.7 ns/call	(sum:499926457.4098142)
-w/ RESCALE macro (pir=0):	0.7 ns/call	(sum:499926457.4098142)
-w/ RESCALE macro (pir=1):	0.8 ns/call	(sum:499926457.4098142)
-w/ foldrescale1:           	1.0 ns/call	(sum:499926457.4098143)
-w/ foldrescale2:           	0.8 ns/call	(sum:499926457.4098142)
-w/ foldrescale3:           	0.8 ns/call	(sum:499926457.4098142)
-w/ foldrescale (pir=0):  	0.9 ns/call	(sum:499926457.4098143)
-w/ foldrescale (pir=1):  	1.0 ns/call	(sum:499926457.4098144)
-						(ans:0)
-Concl:
+alex@fiona /home/alex/numerics/finufft/devel> g++-9 -O3 -funroll-loops -march=native
+-I../include -fopenmp foldrescale_perf2.cpp -o foldrescale_perf2 -lgomp -DNOBIN alex@fiona
+/home/alex/numerics/finufft/devel> ./foldrescale_perf2 simple array sum:           	1.3
+ns/call	(sum:-5028.023988434961) w/ RESCALE1 macro:       	1.3 ns/call
+(sum:499984776.5128576) w/ RESCALE macro (pir=0):	6.4 ns/call	(sum:499984776.5128576) w/
+RESCALE macro (pir=1):	1.4 ns/call	(sum:499984776.5128576) w/
+foldrescale1:           	7.8 ns/call	(sum:499984776.5128576) w/
+foldrescale2:           	6.2 ns/call	(sum:499984776.5128576) w/
+foldrescale3:           	6.4 ns/call	(sum:499984776.5128576) w/ foldrescale
+(pir=0):  	6.3 ns/call	(sum:499984776.5128576) w/ foldrescale (pir=1):  	8.2 ns/call
+(sum:499984776.5128576) (ans:0)
+
+alex@fiona /home/alex/numerics/finufft/devel> g++-9 -O3 -funroll-loops -march=native
+-I../include -fopenmp foldrescale_perf2.cpp -o foldrescale_perf2 -lgomp -Ofast
+-fno-finite-math-only -DNOBIN alex@fiona /home/alex/numerics/finufft/devel>
+./foldrescale_perf2 simple array sum:           	0.4 ns/call	(sum:-14573.38274652959) w/
+RESCALE1 macro:       	0.7 ns/call	(sum:499926457.4098142) w/ RESCALE macro (pir=0):	0.7
+ns/call	(sum:499926457.4098142) w/ RESCALE macro (pir=1):	0.8 ns/call
+(sum:499926457.4098142) w/ foldrescale1:           	1.0 ns/call	(sum:499926457.4098143) w/
+foldrescale2:           	0.8 ns/call	(sum:499926457.4098142) w/ foldrescale3: 0.8 ns/call
+(sum:499926457.4098142) w/ foldrescale (pir=0):  	0.9 ns/call	(sum:499926457.4098143) w/
+foldrescale (pir=1):  	1.0 ns/call	(sum:499926457.4098144) (ans:0) Concl:
 * foldrescale FUNCTION is only fast when Ofast & NOBIN, really weird.
 * macro *is* faster than function, even modern g++.
 * RESCALE is same as RESCALE1
@@ -118,32 +114,34 @@ can recover isnan handling with -Ofast -fno-finite-math-only     .. good!
 #include "finufft/defs.h"
 
 #include <math.h>
+#include <omp.h>
 #include <stdio.h>
 #include <vector>
-#include <omp.h>
 // let's try the "modern" C++ way to time... yuk...
 #include <chrono>
 using namespace std::chrono;
 
-
 // old coord-handling macro ------------------------------------------------
-//#define RESCALE(x,N,p) (p ?                                           \
-//                        (x*(FLT)M_1_2PI*N + (x*(FLT)M_1_2PI*N<-N/(FLT)2.0 ? (FLT)1.5 : (x*(FLT)M_1_2PI*N>N/(FLT)2.0 ? (FLT)-0.5 : (FLT)0.5))*N) : \
+// #define RESCALE(x,N,p) (p ?                                           \
+//                        (x*(FLT)M_1_2PI*N + (x*(FLT)M_1_2PI*N<-N/(FLT)2.0 ? (FLT)1.5 :
+//                        (x*(FLT)M_1_2PI*N>N/(FLT)2.0 ? (FLT)-0.5 : (FLT)0.5))*N) : \
 //                        (x<(FLT)0.0 ? x+(FLT)N : (x>(FLT)N ? x-(FLT)N : x)))
 // casting makes no difference
 
 // cleaner rewrite, no slower:
-#define RESCALE(x,N,p) (p ?                                             \
-         (x + (x>=-PI ? (x<PI ? PI : -PI) : 3*PI)) * ((FLT)M_1_2PI*N) : \
-                        (x>=0.0 ? (x<(FLT)N ? x : x-(FLT)N) : x+(FLT)N))
+#define RESCALE(x, N, p)                                                    \
+  (p ? (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT)M_1_2PI * N) \
+     : (x >= 0.0 ? (x < (FLT)N ? x : x - (FLT)N) : x + (FLT)N))
 
 // pirange=1 fixed ver of old coord-handling macro ------------------------
-//#define RESCALE1(x,N) (x*(FLT)M_1_2PI*N + (x*(FLT)M_1_2PI*N<-N/(FLT)2.0 ? (FLT)1.5*N : (x*(FLT)M_1_2PI*N>N/(FLT)2.0 ? (FLT)-0.5*N : (FLT)0.5*N)))
+// #define RESCALE1(x,N) (x*(FLT)M_1_2PI*N + (x*(FLT)M_1_2PI*N<-N/(FLT)2.0 ? (FLT)1.5*N :
+// (x*(FLT)M_1_2PI*N>N/(FLT)2.0 ? (FLT)-0.5*N : (FLT)0.5*N)))
 // it does matter how written: this made faster...
-//#define RESCALE1(x,N) (x*(FLT)M_1_2PI + (x*(FLT)M_1_2PI<-0.5 ? 1.5 : (x*(FLT)M_1_2PI>0.5 ? -0.5 : 0.5)))*N
-
-#define RESCALE1(x,N) (x + (x>=-PI ? (x<PI ? PI : -PI) : 3*PI))*((FLT)M_1_2PI*N)
+// #define RESCALE1(x,N) (x*(FLT)M_1_2PI + (x*(FLT)M_1_2PI<-0.5 ? 1.5 :
+// (x*(FLT)M_1_2PI>0.5 ? -0.5 : 0.5)))*N
 
+#define RESCALE1(x, N) \
+  (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT)M_1_2PI * N)
 
 // function equivalents -----------------------------------------------------
 static inline FLT foldrescale(FLT x, BIGINT N, int pirange)
@@ -153,184 +151,199 @@ static inline FLT foldrescale(FLT x, BIGINT N, int pirange)
 {
   // affine rescale...
   FLT z = x;
-  if (pirange)
-    z = (N/(2*PI)) * (x+PI);                  // PI is (FLT)M_PI in defs.h
+  if (pirange) z = (N / (2 * PI)) * (x + PI); // PI is (FLT)M_PI in defs.h
   // fold...
-  if (z<(FLT)0.0)
+  if (z < (FLT)0.0)
     z += (FLT)N;
-  else if (z>=(FLT)N)
+  else if (z >= (FLT)N)
     z -= (FLT)N;
   return z;
-} 
+}
 
 static inline FLT foldrescale1(FLT x, BIGINT N)
 // same as above but hardwired pirange=1. rescale then fold
 {
   // affine rescale always...
-  FLT z = (N/(2*PI)) * (x+PI);                  // PI is (FLT)M_PI in defs.h
+  FLT z = (N / (2 * PI)) * (x + PI); // PI is (FLT)M_PI in defs.h
   // fold...
-  if (z<(FLT)0.0)
+  if (z < (FLT)0.0)
     z += (FLT)N;
-  else if (z>=(FLT)N)
+  else if (z >= (FLT)N)
     z -= (FLT)N;
   return z;
-} 
+}
 
 static inline FLT foldrescale2(FLT x, BIGINT N)
 // same as above but hardwired pirange=1, flip so fold done before rescale
 {
-  if (x<-PI)
-    x += 2*PI;
-  else if (x>PI)
-    x -= 2*PI;
-  return (N/(2*PI)) * (x+PI);
-} 
+  if (x < -PI)
+    x += 2 * PI;
+  else if (x > PI)
+    x -= 2 * PI;
+  return (N / (2 * PI)) * (x + PI);
+}
 
 static inline FLT foldrescale3(FLT x, BIGINT N)
 // same as above but hardwired pirange=1, flip so fold done before rescale
 {
-  if (x<-PI)
-    x += 3*PI;
-  else if (x>PI)
+  if (x < -PI)
+    x += 3 * PI;
+  else if (x > PI)
     x -= PI;
   else
     x += PI;
-  return (N/(2*PI)) * x;
+  return (N / (2 * PI)) * x;
 }
 
-
-
 // ==========================================================================
-int main(int argc, char* argv[])
-{
-  int M=10000000;                 // default: # pts to test (>=1e7 is acc)
-  int N = 100;                    // grid size, matters that unknown @ compile
-  
-  if (argc>1) { double w; sscanf(argv[1],"%lf",&w); M = (int)w; }
-  if (argc>2) { double w; sscanf(argv[2],"%lf",&w); N = (int)w; }  
-  std::vector<int> c(N,0);        // let's do basic binning while we're at it
-                                  // to prevent compiler optims
-  int maxc=0;                     // use for max bin count
-   
+int main(int argc, char *argv[]) {
+  int M = 10000000; // default: # pts to test (>=1e7 is acc)
+  int N = 100;      // grid size, matters that unknown @ compile
+
+  if (argc > 1) {
+    double w;
+    sscanf(argv[1], "%lf", &w);
+    M = (int)w;
+  }
+  if (argc > 2) {
+    double w;
+    sscanf(argv[2], "%lf", &w);
+    N = (int)w;
+  }
+  std::vector<int> c(N, 0); // let's do basic binning while we're at it
+                            // to prevent compiler optims
+  int maxc = 0;             // use for max bin count
+
   // fill array w/ random #s (in par), deterministic seeds based on threads
   std::vector<FLT> x(M);
 #pragma omp parallel
   {
-    unsigned int s=omp_get_thread_num();  // needed for parallel random #s
-#pragma omp for schedule(dynamic,1000000)
-    for (int i=0; i<M; ++i)
-      x[i] = 3.0*PI*randm11r(&s);          // unif over the folded domain
+    unsigned int s = omp_get_thread_num(); // needed for parallel random #s
+#pragma omp for schedule(dynamic, 1000000)
+    for (int i = 0; i < M; ++i)
+      x[i] = 3.0 * PI * randm11r(&s); // unif over the folded domain
   }
   // (note when pirange=0 the conditional <0 vs >=0 still 1:2 random)
   // We'll reuse this array by rescaling/unrescaling by hand.
-  
-  FLT sum=0.0;
+
+  FLT sum     = 0.0;
   auto tbegin = system_clock::now();
-  for (int i=0;i<M;++i)
-    sum += x[i];                          // simply sweep through array
-  duration<double> dur = system_clock::now() - tbegin;   // dur.count() is sec
-  printf("simple array sum:           \t%.1f ns/call\t(sum:%.16g)\n",1e9*dur.count()/(double)M,sum);
+  for (int i = 0; i < M; ++i) sum += x[i];             // simply sweep through array
+  duration<double> dur = system_clock::now() - tbegin; // dur.count() is sec
+  printf("simple array sum:           \t%.1f ns/call\t(sum:%.16g)\n",
+         1e9 * dur.count() / (double)M, sum);
 
 #ifndef NOBIN
   tbegin = system_clock::now();
-  for (int i=0;i<M;++i) {
-    int b = (int)(N*((1.0/(6*PI))*x[i] + (FLT)0.5));   // in {0,..,N-1}
+  for (int i = 0; i < M; ++i) {
+    int b = (int)(N * ((1.0 / (6 * PI)) * x[i] + (FLT)0.5)); // in {0,..,N-1}
     ++c[b];
-    //if (b<0 || b>=N) printf("b[%d]=%d (x=%.16g, flt b=%.16g)\n",i,b,x[i],N*((1.0/(6*PI))*x[i] + 0.5));  // chk all indices ok!
+    // if (b<0 || b>=N) printf("b[%d]=%d (x=%.16g, flt
+    // b=%.16g)\n",i,b,x[i],N*((1.0/(6*PI))*x[i] + 0.5));  // chk all indices ok!
   }
-  dur = system_clock::now() - tbegin;   // dur.count() is sec
-  for(int b=0;b<N;++b) if (c[b]>maxc) maxc=c[b];   // somehow use it
-  printf("simple bin over [-3pi,3pi):  \t%.1f ns/call\t(ans:%d)\n",1e9*dur.count()/(double)M,maxc);
+  dur = system_clock::now() - tbegin; // dur.count() is sec
+  for (int b = 0; b < N; ++b)
+    if (c[b] > maxc) maxc = c[b];     // somehow use it
+  printf("simple bin over [-3pi,3pi):  \t%.1f ns/call\t(ans:%d)\n",
+         1e9 * dur.count() / (double)M, maxc);
 #endif
-  
-  sum = 0.0;    // hardwired pirange=1 MACRO.......................
+
+  sum    = 0.0; // hardwired pirange=1 MACRO.......................
   tbegin = system_clock::now();
-  for (int i=0;i<M;++i) {
-    FLT z = RESCALE1(x[i],N);
+  for (int i = 0; i < M; ++i) {
+    FLT z = RESCALE1(x[i], N);
     sum += z;
 #ifndef NOBIN
-    ++c[(int)z];       // bin it
+    ++c[(int)z]; // bin it
 #endif
   }
-  dur = system_clock::now() - tbegin;   // dur.count() is sec
-  printf("w/ RESCALE1 macro:       \t%.1f ns/call\t(sum:%.16g)\n",1e9*dur.count()/(double)M,sum);
+  dur = system_clock::now() - tbegin; // dur.count() is sec
+  printf("w/ RESCALE1 macro:       \t%.1f ns/call\t(sum:%.16g)\n",
+         1e9 * dur.count() / (double)M, sum);
 
-  for (int pirange=0;pirange<2;++pirange) {
+  for (int pirange = 0; pirange < 2; ++pirange) {
     if (!pirange)
-      for (int i=0;i<M;++i) x[i] = (N/(2*PI)) * (x[i]+PI);   // rescale to [0,N)
-    //FLT mx=0.0; for (int i=0;i<M;++i) if (x[i]>mx) mx=x[i];   // chk max
-    //printf("max x=%.3g\n",mx);
-    sum = 0.0;
+      for (int i = 0; i < M; ++i) x[i] = (N / (2 * PI)) * (x[i] + PI); // rescale to [0,N)
+    // FLT mx=0.0; for (int i=0;i<M;++i) if (x[i]>mx) mx=x[i];   // chk max
+    // printf("max x=%.3g\n",mx);
+    sum    = 0.0;
     tbegin = system_clock::now();
-    for (int i=0;i<M;++i) {
-      FLT z = RESCALE(x[i],N,pirange);
+    for (int i = 0; i < M; ++i) {
+      FLT z = RESCALE(x[i], N, pirange);
       sum += z;
 #ifndef NOBIN
-      ++c[(int)z];       // bin it
+      ++c[(int)z]; // bin it
 #endif
     }
-    dur = system_clock::now() - tbegin;   // dur.count() is sec
-    printf("w/ RESCALE macro (pir=%d):\t%.1f ns/call\t(sum:%.16g)\n",pirange,1e9*dur.count()/(double)M,sum);
+    dur = system_clock::now() - tbegin; // dur.count() is sec
+    printf("w/ RESCALE macro (pir=%d):\t%.1f ns/call\t(sum:%.16g)\n", pirange,
+           1e9 * dur.count() / (double)M, sum);
     if (!pirange)
-      for (int i=0;i<M;++i) x[i] = x[i]*((2*PI)/N) - PI;   // undo rescale
+      for (int i = 0; i < M; ++i) x[i] = x[i] * ((2 * PI) / N) - PI; // undo rescale
   }
-  
-  sum = 0.0;    // hardwired pirange=1 FUNC.......................
+
+  sum    = 0.0; // hardwired pirange=1 FUNC.......................
   tbegin = system_clock::now();
-  for (int i=0;i<M;++i) {
-    FLT z = foldrescale1(x[i],N);
+  for (int i = 0; i < M; ++i) {
+    FLT z = foldrescale1(x[i], N);
     sum += z;
 #ifndef NOBIN
-    ++c[(int)z];       // bin it
+    ++c[(int)z]; // bin it
 #endif
   }
-  dur = system_clock::now() - tbegin;   // dur.count() is sec
-  printf("w/ foldrescale1:           \t%.1f ns/call\t(sum:%.16g)\n",1e9*dur.count()/(double)M,sum);
+  dur = system_clock::now() - tbegin; // dur.count() is sec
+  printf("w/ foldrescale1:           \t%.1f ns/call\t(sum:%.16g)\n",
+         1e9 * dur.count() / (double)M, sum);
 
-  sum = 0.0;    // hardwired pirange=1 FUNC.......................
+  sum    = 0.0; // hardwired pirange=1 FUNC.......................
   tbegin = system_clock::now();
-  for (int i=0;i<M;++i) {
-    FLT z = foldrescale2(x[i],N);
+  for (int i = 0; i < M; ++i) {
+    FLT z = foldrescale2(x[i], N);
     sum += z;
 #ifndef NOBIN
-    ++c[(int)z];       // bin it
+    ++c[(int)z]; // bin it
 #endif
   }
-  dur = system_clock::now() - tbegin;   // dur.count() is sec
-  printf("w/ foldrescale2:           \t%.1f ns/call\t(sum:%.16g)\n",1e9*dur.count()/(double)M,sum);
+  dur = system_clock::now() - tbegin; // dur.count() is sec
+  printf("w/ foldrescale2:           \t%.1f ns/call\t(sum:%.16g)\n",
+         1e9 * dur.count() / (double)M, sum);
 
-  sum = 0.0;    // hardwired pirange=1 FUNC.......................
+  sum    = 0.0; // hardwired pirange=1 FUNC.......................
   tbegin = system_clock::now();
-  for (int i=0;i<M;++i) {
-    FLT z = foldrescale3(x[i],N);
+  for (int i = 0; i < M; ++i) {
+    FLT z = foldrescale3(x[i], N);
     sum += z;
 #ifndef NOBIN
-    ++c[(int)z];       // bin it
+    ++c[(int)z]; // bin it
 #endif
   }
-  dur = system_clock::now() - tbegin;   // dur.count() is sec
-  printf("w/ foldrescale3:           \t%.1f ns/call\t(sum:%.16g)\n",1e9*dur.count()/(double)M,sum);
+  dur = system_clock::now() - tbegin; // dur.count() is sec
+  printf("w/ foldrescale3:           \t%.1f ns/call\t(sum:%.16g)\n",
+         1e9 * dur.count() / (double)M, sum);
 
-  for (int pirange=0;pirange<2;++pirange) {
+  for (int pirange = 0; pirange < 2; ++pirange) {
     if (!pirange)
-      for (int i=0;i<M;++i) x[i] = (N/(2*PI)) * (x[i]+PI);   // rescale to [0,N)
-    sum = 0.0;
+      for (int i = 0; i < M; ++i) x[i] = (N / (2 * PI)) * (x[i] + PI); // rescale to [0,N)
+    sum    = 0.0;
     tbegin = system_clock::now();
-    for (int i=0;i<M;++i) {
-      FLT z = foldrescale(x[i],N,pirange);
+    for (int i = 0; i < M; ++i) {
+      FLT z = foldrescale(x[i], N, pirange);
       sum += z;
 #ifndef NOBIN
-      ++c[(int)z];       // bin it
+      ++c[(int)z]; // bin it
 #endif
     }
-    dur = system_clock::now() - tbegin;   // dur.count() is sec
-    printf("w/ foldrescale (pir=%d):  \t%.1f ns/call\t(sum:%.16g)\n",pirange,1e9*dur.count()/(double)M,sum);
+    dur = system_clock::now() - tbegin; // dur.count() is sec
+    printf("w/ foldrescale (pir=%d):  \t%.1f ns/call\t(sum:%.16g)\n", pirange,
+           1e9 * dur.count() / (double)M, sum);
     if (!pirange)
-      for (int i=0;i<M;++i) x[i] = x[i]*((2*PI)/N) - PI;   // undo rescale
+      for (int i = 0; i < M; ++i) x[i] = x[i] * ((2 * PI) / N) - PI; // undo rescale
   }
 
   // force it to not optimize away the bin filling steps:
-  maxc=0; for(int b=0;b<N;++b) if (c[b]>maxc) maxc=c[b];   // somehow use it
-  printf("\t\t\t\t\t\t(ans:%d)\n",maxc);
+  maxc = 0;
+  for (int b = 0; b < N; ++b)
+    if (c[b] > maxc) maxc = c[b]; // somehow use it
+  printf("\t\t\t\t\t\t(ans:%d)\n", maxc);
   return 0;
 }
diff --git a/devel/interp_square_nowrap.cpp b/devel/interp_square_nowrap.cpp
index d17b32a89..8cd3758b5 100644
--- a/devel/interp_square_nowrap.cpp
+++ b/devel/interp_square_nowrap.cpp
@@ -1,29 +1,31 @@
 // this is code I was messing with timing using time2d2interp.cpp
 // around May 3, 2018, to figure how wrapping was slowing down spreading.
 
-void interp_square_nowrap(FLT *out,FLT *du, FLT *ker1, FLT *ker2, BIGINT i1,BIGINT i2,BIGINT N1,BIGINT N2,int ns)
+void interp_square_nowrap(FLT *out, FLT *du, FLT *ker1, FLT *ker2, BIGINT i1, BIGINT i2,
+                          BIGINT N1, BIGINT N2, int ns)
 // *************** don't periodic wrap, avoid ptrs. correct if no NU pts nr edge
 {
-  out[0] = 0.0; out[1] = 0.0;
-  if (0) {  // plain
-    for (int dy=0; dy<ns; dy++) {
-      BIGINT j = N1*(i2+dy) + i1;
-      for (int dx=0; dx<ns; dx++) {
-	FLT k = ker1[dx]*ker2[dy];
-	out[0] += du[2*j] * k;
-	out[1] += du[2*j+1] * k;
-	++j;
+  out[0] = 0.0;
+  out[1] = 0.0;
+  if (0) { // plain
+    for (int dy = 0; dy < ns; dy++) {
+      BIGINT j = N1 * (i2 + dy) + i1;
+      for (int dx = 0; dx < ns; dx++) {
+        FLT k = ker1[dx] * ker2[dy];
+        out[0] += du[2 * j] * k;
+        out[1] += du[2 * j + 1] * k;
+        ++j;
       }
     }
   } else {
-   for (int dy=0; dy<ns; dy++) {
-      BIGINT j = N1*(i2+dy) + i1;
-      //#pragma omp simd
-      for (int dx=0; dx<ns; dx++) {
-	FLT k = ker1[dx]*ker2[dy];
-	out[0] += du[2*j] * k;
-	out[1] += du[2*j+1] * k;
-	++j;
+    for (int dy = 0; dy < ns; dy++) {
+      BIGINT j = N1 * (i2 + dy) + i1;
+      // #pragma omp simd
+      for (int dx = 0; dx < ns; dx++) {
+        FLT k = ker1[dx] * ker2[dy];
+        out[0] += du[2 * j] * k;
+        out[1] += du[2 * j + 1] * k;
+        ++j;
       }
     }
   }
diff --git a/devel/padding.cpp b/devel/padding.cpp
index 846e85764..0366ede73 100644
--- a/devel/padding.cpp
+++ b/devel/padding.cpp
@@ -1,34 +1,27 @@
 //
 // Created by mbarbone on 5/17/24.
 //
-#include <xsimd/xsimd.hpp>
 #include <cstdint>
-#include <type_traits>
 #include <iostream>
+#include <type_traits>
+#include <xsimd/xsimd.hpp>
 
-template<class T, uint16_t N, uint16_t K = N>
-static constexpr auto BestSIMDHelper();
+template<class T, uint16_t N, uint16_t K = N> static constexpr auto BestSIMDHelper();
 
-template<class T, uint16_t N>
-static constexpr auto GetPaddedSIMDSize();
+template<class T, uint16_t N> static constexpr auto GetPaddedSIMDSize();
 
-template<class T>
-static uint16_t get_padding(uint16_t ns);
+template<class T> static uint16_t get_padding(uint16_t ns);
 
-template<class T, uint16_t ns>
-static constexpr auto get_padding();
+template<class T, uint16_t ns> static constexpr auto get_padding();
 
 template<class T, uint16_t N>
 using BestSIMD = typename decltype(BestSIMDHelper<T, N, xsimd::batch<T>::size>())::type;
 
-template<class T, uint16_t N = 1>
-static constexpr uint16_t min_batch_size();
+template<class T, uint16_t N = 1> static constexpr uint16_t min_batch_size();
 
-template<class T, uint16_t N = min_batch_size<T>()>
-constexpr uint16_t max_batch_size();
+template<class T, uint16_t N = min_batch_size<T>()> constexpr uint16_t max_batch_size();
 
-template<class T, uint16_t N>
-static constexpr auto find_optimal_batch_size();
+template<class T, uint16_t N> static constexpr auto find_optimal_batch_size();
 
 // below there is some trickery to obtain the padded SIMD type to vectorize
 // the given number of elements.
@@ -36,55 +29,50 @@ static constexpr auto find_optimal_batch_size();
 // or on older ones... "compiler internal error please report"
 // you have been warned.
 
-template<class T, uint16_t N, uint16_t K>
-static constexpr auto BestSIMDHelper() {
+template<class T, uint16_t N, uint16_t K> static constexpr auto BestSIMDHelper() {
   if constexpr (N % K == 0) { // returns void in the worst case
     return xsimd::make_sized_batch<T, K>{};
   } else {
-    return BestSIMDHelper<T, N, (K>>1)>();
+    return BestSIMDHelper<T, N, (K >> 1)>();
   }
 }
 
-template<class T, uint16_t N>
-constexpr uint16_t min_batch_size() {
+template<class T, uint16_t N> constexpr uint16_t min_batch_size() {
   if constexpr (std::is_void_v<xsimd::make_sized_batch_t<T, N>>) {
-    return min_batch_size<T, N*2>();
+    return min_batch_size<T, N * 2>();
   } else {
     return N;
   }
 };
 
-template<class T, uint16_t N>
-constexpr uint16_t max_batch_size() {
-  if constexpr (!std::is_void_v<xsimd::make_sized_batch_t<T, N*2>>) {
-    return max_batch_size<T, N*2>();
+template<class T, uint16_t N> constexpr uint16_t max_batch_size() {
+  if constexpr (!std::is_void_v<xsimd::make_sized_batch_t<T, N * 2>>) {
+    return max_batch_size<T, N * 2>();
   } else {
     return N;
   }
 };
 
-template<class T, uint16_t N>
-static constexpr auto find_optimal_batch_size() {
-  uint16_t min_iterations = N;
+template<class T, uint16_t N> static constexpr auto find_optimal_batch_size() {
+  uint16_t min_iterations     = N;
   uint16_t optimal_batch_size = 1;
-  for (uint16_t batch_size = min_batch_size<T>(); batch_size <= xsimd::batch<T>::size; batch_size *= 2) {
+  for (uint16_t batch_size = min_batch_size<T>(); batch_size <= xsimd::batch<T>::size;
+       batch_size *= 2) {
     uint16_t iterations = (N + batch_size - 1) / batch_size;
     if (iterations < min_iterations) {
-      min_iterations = iterations;
+      min_iterations     = iterations;
       optimal_batch_size = batch_size;
     }
   }
   return optimal_batch_size;
 }
 
-template<class T, uint16_t N>
-static constexpr auto GetPaddedSIMDSize() {
+template<class T, uint16_t N> static constexpr auto GetPaddedSIMDSize() {
   static_assert(N < 128);
-    return xsimd::make_sized_batch<T, find_optimal_batch_size<T, N>()>::type::size;
+  return xsimd::make_sized_batch<T, find_optimal_batch_size<T, N>()>::type::size;
 }
 
-template<class T, uint16_t ns>
-static constexpr auto get_padding() {
+template<class T, uint16_t ns> static constexpr auto get_padding() {
   constexpr uint16_t width = GetPaddedSIMDSize<T, ns>();
   return ns % width == 0 ? 0 : width - (ns % width);
 }
@@ -102,13 +90,11 @@ static constexpr auto get_padding_helper(uint16_t runtime_ns) {
   }
 }
 
-template<class T>
-static uint16_t get_padding(uint16_t ns) {
+template<class T> static uint16_t get_padding(uint16_t ns) {
   return get_padding_helper<T, 32>(ns);
 }
 
-template<class T>
-std::ostream & print(T arg) {
+template<class T> std::ostream &print(T arg) {
   typename T::value_type sum = 0;
   for (const auto &elem : arg) {
     std::cout << elem << " ";
@@ -118,34 +104,31 @@ std::ostream & print(T arg) {
   return std::cout;
 }
 
-
-template<uint16_t low, uint16_t high>
-constexpr uint16_t po2_in_between() {
+template<uint16_t low, uint16_t high> constexpr uint16_t po2_in_between() {
   std::uint16_t result = 0;
-  for (auto i = low; i <= high; i<<=1 ) {
+  for (auto i = low; i <= high; i <<= 1) {
     result++;
   }
   return result;
 }
 
-template<class T, uint16_t N>
-constexpr auto mixed_vectors() {
+template<class T, uint16_t N> constexpr auto mixed_vectors() {
   constexpr auto min_batch = min_batch_size<T>();
   constexpr auto max_batch = max_batch_size<T>();
   // compute all the power of 2 between min_batch and max_batch
 
-  std::array<uint16_t, po2_in_between<min_batch, max_batch>()+1> batch_sizes{1};
+  std::array<uint16_t, po2_in_between<min_batch, max_batch>() + 1> batch_sizes{1};
   for (uint16_t i = 1; i < batch_sizes.size(); i++) {
     batch_sizes[i] = min_batch << (i - 1);
   }
   print(batch_sizes);
-  std::array<uint16_t, N+1> chosen_batch_sizes{0}, dp{N+1};
-  dp[0] = 0;  // 0 amount requires 0 coins
+  std::array<uint16_t, N + 1> chosen_batch_sizes{0}, dp{N + 1};
+  dp[0] = 0; // 0 amount requires 0 coins
 
-  for (uint16_t i = 0; i < N+1; ++i) {
+  for (uint16_t i = 0; i < N + 1; ++i) {
     for (const auto batch_size : batch_sizes) {
       if (batch_size <= i && dp[i - batch_size] + 1 < dp[i]) {
-        dp[i] = dp[i - batch_size] + 1;
+        dp[i]                 = dp[i - batch_size] + 1;
         chosen_batch_sizes[i] = batch_size;
       }
     }
@@ -160,8 +143,6 @@ constexpr auto mixed_vectors() {
   return sequence;
 }
 
-
-
 int main(int argc, char *argv[]) {
   std::cout << "sequence for 16 single precision is ";
   print(mixed_vectors<float, 16>()) << std::endl;
@@ -183,86 +164,140 @@ int main(int argc, char *argv[]) {
   std::cout << "sequence for 18 double precision is ";
   print(mixed_vectors<double, 18>()) << std::endl;
 
-
   std::cout << "sequence for 31 single precision is ";
   print(mixed_vectors<float, 31>()) << std::endl;
   std::cout << "sequence for 31 double precision is ";
   print(mixed_vectors<double, 31>()) << std::endl;
 
-  std::cout << "Min batch size for single precision is " << uint64_t(min_batch_size<float>()) << std::endl;
-  std::cout << "Max batch size for single precision is " << uint64_t(max_batch_size<float>()) << std::endl;
-  std::cout << "Min batch size for double precision is " << uint64_t(min_batch_size<double>()) << std::endl;
-  std::cout << "Max batch size for double precision is " << uint64_t(max_batch_size<double>()) << std::endl;
+  std::cout << "Min batch size for single precision is "
+            << uint64_t(min_batch_size<float>()) << std::endl;
+  std::cout << "Max batch size for single precision is "
+            << uint64_t(max_batch_size<float>()) << std::endl;
+  std::cout << "Min batch size for double precision is "
+            << uint64_t(min_batch_size<double>()) << std::endl;
+  std::cout << "Max batch size for double precision is "
+            << uint64_t(max_batch_size<double>()) << std::endl;
 
   std::cout << "Best SIMD single precision" << std::endl;
-  std::cout << "SIMD for " <<  4 << " is " << uint64_t(BestSIMD<float,  4>::size) << std::endl;
-  std::cout << "SIMD for " <<  8 << " is " << uint64_t(BestSIMD<float,  8>::size) << std::endl;
-  std::cout << "SIMD for " << 12 << " is " << uint64_t(BestSIMD<float, 12>::size) << std::endl;
-  std::cout << "SIMD for " << 16 << " is " << uint64_t(BestSIMD<float, 16>::size) << std::endl;
-  std::cout << "SIMD for " << 20 << " is " << uint64_t(BestSIMD<float, 20>::size) << std::endl;
-  std::cout << "SIMD for " << 24 << " is " << uint64_t(BestSIMD<float, 24>::size) << std::endl;
-  std::cout << "SIMD for " << 28 << " is " << uint64_t(BestSIMD<float, 28>::size) << std::endl;
-  std::cout << "SIMD for " << 32 << " is " << uint64_t(BestSIMD<float, 32>::size) << std::endl;
+  std::cout << "SIMD for " << 4 << " is " << uint64_t(BestSIMD<float, 4>::size)
+            << std::endl;
+  std::cout << "SIMD for " << 8 << " is " << uint64_t(BestSIMD<float, 8>::size)
+            << std::endl;
+  std::cout << "SIMD for " << 12 << " is " << uint64_t(BestSIMD<float, 12>::size)
+            << std::endl;
+  std::cout << "SIMD for " << 16 << " is " << uint64_t(BestSIMD<float, 16>::size)
+            << std::endl;
+  std::cout << "SIMD for " << 20 << " is " << uint64_t(BestSIMD<float, 20>::size)
+            << std::endl;
+  std::cout << "SIMD for " << 24 << " is " << uint64_t(BestSIMD<float, 24>::size)
+            << std::endl;
+  std::cout << "SIMD for " << 28 << " is " << uint64_t(BestSIMD<float, 28>::size)
+            << std::endl;
+  std::cout << "SIMD for " << 32 << " is " << uint64_t(BestSIMD<float, 32>::size)
+            << std::endl;
 
   std::cout << "Best SIMD double precision" << std::endl;
-  std::cout << "SIMD for " <<  4 << " is " << uint64_t(BestSIMD<double,  4>::size)  << std::endl;
-  std::cout << "SIMD for " <<  8 << " is " << uint64_t(BestSIMD<double,  8>::size)  << std::endl;
-  std::cout << "SIMD for " << 12 << " is " << uint64_t(BestSIMD<double, 12>::size)  << std::endl;
-  std::cout << "SIMD for " << 16 << " is " << uint64_t(BestSIMD<double, 16>::size)  << std::endl;
-  std::cout << "SIMD for " << 20 << " is " << uint64_t(BestSIMD<double, 20>::size)  << std::endl;
-  std::cout << "SIMD for " << 24 << " is " << uint64_t(BestSIMD<double, 24>::size)  << std::endl;
-  std::cout << "SIMD for " << 28 << " is " << uint64_t(BestSIMD<double, 28>::size)  << std::endl;
-  std::cout << "SIMD for " << 32 << " is " << uint64_t(BestSIMD<double, 32>::size)  << std::endl;
+  std::cout << "SIMD for " << 4 << " is " << uint64_t(BestSIMD<double, 4>::size)
+            << std::endl;
+  std::cout << "SIMD for " << 8 << " is " << uint64_t(BestSIMD<double, 8>::size)
+            << std::endl;
+  std::cout << "SIMD for " << 12 << " is " << uint64_t(BestSIMD<double, 12>::size)
+            << std::endl;
+  std::cout << "SIMD for " << 16 << " is " << uint64_t(BestSIMD<double, 16>::size)
+            << std::endl;
+  std::cout << "SIMD for " << 20 << " is " << uint64_t(BestSIMD<double, 20>::size)
+            << std::endl;
+  std::cout << "SIMD for " << 24 << " is " << uint64_t(BestSIMD<double, 24>::size)
+            << std::endl;
+  std::cout << "SIMD for " << 28 << " is " << uint64_t(BestSIMD<double, 28>::size)
+            << std::endl;
+  std::cout << "SIMD for " << 32 << " is " << uint64_t(BestSIMD<double, 32>::size)
+            << std::endl;
 
   std::cout << "Padded SIMD single precision" << std::endl;
-  std::cout << "Padded SIMD for " <<  4 << " is " << uint64_t(GetPaddedSIMDSize<float,  4>()) << std::endl;
-  std::cout << "Padded SIMD for " <<  6 << " is " << uint64_t(GetPaddedSIMDSize<float,  6>()) << std::endl;
-  std::cout << "Padded SIMD for " << 10 << " is " << uint64_t(GetPaddedSIMDSize<float, 10>()) << std::endl;
-  std::cout << "Padded SIMD for " << 12 << " is " << uint64_t(GetPaddedSIMDSize<float, 12>()) << std::endl;
-  std::cout << "Padded SIMD for " << 15 << " is " << uint64_t(GetPaddedSIMDSize<float, 15>()) << std::endl;
-  std::cout << "Padded SIMD for " << 18 << " is " << uint64_t(GetPaddedSIMDSize<float, 18>()) << std::endl;
-  std::cout << "Padded SIMD for " << 22 << " is " << uint64_t(GetPaddedSIMDSize<float, 22>()) << std::endl;
-  std::cout << "Padded SIMD for " << 26 << " is " << uint64_t(GetPaddedSIMDSize<float, 26>()) << std::endl;
-  std::cout << "Padded SIMD for " << 30 << " is " << uint64_t(GetPaddedSIMDSize<float, 30>()) << std::endl;
-  std::cout << "Padded SIMD for " << 32 << " is " << uint64_t(GetPaddedSIMDSize<float, 32>()) << std::endl;
+  std::cout << "Padded SIMD for " << 4 << " is "
+            << uint64_t(GetPaddedSIMDSize<float, 4>()) << std::endl;
+  std::cout << "Padded SIMD for " << 6 << " is "
+            << uint64_t(GetPaddedSIMDSize<float, 6>()) << std::endl;
+  std::cout << "Padded SIMD for " << 10 << " is "
+            << uint64_t(GetPaddedSIMDSize<float, 10>()) << std::endl;
+  std::cout << "Padded SIMD for " << 12 << " is "
+            << uint64_t(GetPaddedSIMDSize<float, 12>()) << std::endl;
+  std::cout << "Padded SIMD for " << 15 << " is "
+            << uint64_t(GetPaddedSIMDSize<float, 15>()) << std::endl;
+  std::cout << "Padded SIMD for " << 18 << " is "
+            << uint64_t(GetPaddedSIMDSize<float, 18>()) << std::endl;
+  std::cout << "Padded SIMD for " << 22 << " is "
+            << uint64_t(GetPaddedSIMDSize<float, 22>()) << std::endl;
+  std::cout << "Padded SIMD for " << 26 << " is "
+            << uint64_t(GetPaddedSIMDSize<float, 26>()) << std::endl;
+  std::cout << "Padded SIMD for " << 30 << " is "
+            << uint64_t(GetPaddedSIMDSize<float, 30>()) << std::endl;
+  std::cout << "Padded SIMD for " << 32 << " is "
+            << uint64_t(GetPaddedSIMDSize<float, 32>()) << std::endl;
 
   std::cout << "Padded SIMD double precision" << std::endl;
-  std::cout << "Padded SIMD for " <<  4 << " is " << uint64_t(GetPaddedSIMDSize<double,  4>())  << std::endl;
-  std::cout << "Padded SIMD for " <<  6 << " is " << uint64_t(GetPaddedSIMDSize<double,  6>())  << std::endl;
-  std::cout << "Padded SIMD for " << 10 << " is " << uint64_t(GetPaddedSIMDSize<double, 10>())  << std::endl;
-  std::cout << "Padded SIMD for " << 12 << " is " << uint64_t(GetPaddedSIMDSize<double, 12>())  << std::endl;
-  std::cout << "Padded SIMD for " << 15 << " is " << uint64_t(GetPaddedSIMDSize<double, 15>())  << std::endl;
-  std::cout << "Padded SIMD for " << 18 << " is " << uint64_t(GetPaddedSIMDSize<double, 18>())  << std::endl;
-  std::cout << "Padded SIMD for " << 22 << " is " << uint64_t(GetPaddedSIMDSize<double, 22>())  << std::endl;
-  std::cout << "Padded SIMD for " << 26 << " is " << uint64_t(GetPaddedSIMDSize<double, 26>())  << std::endl;
-  std::cout << "Padded SIMD for " << 30 << " is " << uint64_t(GetPaddedSIMDSize<double, 30>())  << std::endl;
-  std::cout << "Padded SIMD for " << 32 << " is " << uint64_t(GetPaddedSIMDSize<double, 32>())  << std::endl;
+  std::cout << "Padded SIMD for " << 4 << " is "
+            << uint64_t(GetPaddedSIMDSize<double, 4>()) << std::endl;
+  std::cout << "Padded SIMD for " << 6 << " is "
+            << uint64_t(GetPaddedSIMDSize<double, 6>()) << std::endl;
+  std::cout << "Padded SIMD for " << 10 << " is "
+            << uint64_t(GetPaddedSIMDSize<double, 10>()) << std::endl;
+  std::cout << "Padded SIMD for " << 12 << " is "
+            << uint64_t(GetPaddedSIMDSize<double, 12>()) << std::endl;
+  std::cout << "Padded SIMD for " << 15 << " is "
+            << uint64_t(GetPaddedSIMDSize<double, 15>()) << std::endl;
+  std::cout << "Padded SIMD for " << 18 << " is "
+            << uint64_t(GetPaddedSIMDSize<double, 18>()) << std::endl;
+  std::cout << "Padded SIMD for " << 22 << " is "
+            << uint64_t(GetPaddedSIMDSize<double, 22>()) << std::endl;
+  std::cout << "Padded SIMD for " << 26 << " is "
+            << uint64_t(GetPaddedSIMDSize<double, 26>()) << std::endl;
+  std::cout << "Padded SIMD for " << 30 << " is "
+            << uint64_t(GetPaddedSIMDSize<double, 30>()) << std::endl;
+  std::cout << "Padded SIMD for " << 32 << " is "
+            << uint64_t(GetPaddedSIMDSize<double, 32>()) << std::endl;
 
   std::cout << "single precision" << std::endl;
-  for(auto i = 2; i < 16; i++){
-    std::cout << "Padding for " << i*2 << " is " << uint64_t(get_padding<float>(i*2)) << std::endl;
+  for (auto i = 2; i < 16; i++) {
+    std::cout << "Padding for " << i * 2 << " is " << uint64_t(get_padding<float>(i * 2))
+              << std::endl;
   }
 
   std::cout << "double precision" << std::endl;
-  for(auto i = 2; i < 16; i++){
-    std::cout << "Padding for " << i*2 << " is " << uint64_t(get_padding<double>(i*2)) << std::endl;
+  for (auto i = 2; i < 16; i++) {
+    std::cout << "Padding for " << i * 2 << " is " << uint64_t(get_padding<double>(i * 2))
+              << std::endl;
   }
 
   std::cout << "single precision" << std::endl;
-  std::cout << "Padding for " <<  3 * 2 << " is " << uint64_t(get_padding<float,  3 * 2>()) << std::endl;
-  std::cout << "Padding for " <<  5 * 2 << " is " << uint64_t(get_padding<float,  5 * 2>()) << std::endl;
-  std::cout << "Padding for " <<  9 * 2 << " is " << uint64_t(get_padding<float,  9 * 2>()) << std::endl;
-  std::cout << "Padding for " << 11 * 2 << " is " << uint64_t(get_padding<float, 11 * 2>()) << std::endl;
-  std::cout << "Padding for " << 13 * 2 << " is " << uint64_t(get_padding<float, 13 * 2>()) << std::endl;
-  std::cout << "Padding for " << 15 * 2 << " is " << uint64_t(get_padding<float, 15 * 2>()) << std::endl;
+  std::cout << "Padding for " << 3 * 2 << " is " << uint64_t(get_padding<float, 3 * 2>())
+            << std::endl;
+  std::cout << "Padding for " << 5 * 2 << " is " << uint64_t(get_padding<float, 5 * 2>())
+            << std::endl;
+  std::cout << "Padding for " << 9 * 2 << " is " << uint64_t(get_padding<float, 9 * 2>())
+            << std::endl;
+  std::cout << "Padding for " << 11 * 2 << " is "
+            << uint64_t(get_padding<float, 11 * 2>()) << std::endl;
+  std::cout << "Padding for " << 13 * 2 << " is "
+            << uint64_t(get_padding<float, 13 * 2>()) << std::endl;
+  std::cout << "Padding for " << 15 * 2 << " is "
+            << uint64_t(get_padding<float, 15 * 2>()) << std::endl;
   std::cout << "double precision" << std::endl;
-  std::cout << "Padding for " <<  3*2 << " is " << uint64_t(get_padding<double,  3 * 2>()) << std::endl;
-  std::cout << "Padding for " <<  5*2 << " is " << uint64_t(get_padding<double,  5 * 2>()) << std::endl;
-  std::cout << "Padding for " <<  7*2 << " is " << uint64_t(get_padding<double,  7 * 2>()) << std::endl;
-  std::cout << "Padding for " <<  9*2 << " is " << uint64_t(get_padding<double,  9 * 2>()) << std::endl;
-  std::cout << "Padding for " << 11*2 << " is " << uint64_t(get_padding<double, 11 * 2>()) << std::endl;
-  std::cout << "Padding for " << 13*2 << " is " << uint64_t(get_padding<double, 13 * 2>()) << std::endl;
-  std::cout << "Padding for " << 15*2 << " is " << uint64_t(get_padding<double, 15 * 2>()) << std::endl;
+  std::cout << "Padding for " << 3 * 2 << " is " << uint64_t(get_padding<double, 3 * 2>())
+            << std::endl;
+  std::cout << "Padding for " << 5 * 2 << " is " << uint64_t(get_padding<double, 5 * 2>())
+            << std::endl;
+  std::cout << "Padding for " << 7 * 2 << " is " << uint64_t(get_padding<double, 7 * 2>())
+            << std::endl;
+  std::cout << "Padding for " << 9 * 2 << " is " << uint64_t(get_padding<double, 9 * 2>())
+            << std::endl;
+  std::cout << "Padding for " << 11 * 2 << " is "
+            << uint64_t(get_padding<double, 11 * 2>()) << std::endl;
+  std::cout << "Padding for " << 13 * 2 << " is "
+            << uint64_t(get_padding<double, 13 * 2>()) << std::endl;
+  std::cout << "Padding for " << 15 * 2 << " is "
+            << uint64_t(get_padding<double, 15 * 2>()) << std::endl;
 
   return 0;
 }
\ No newline at end of file
diff --git a/devel/test_ker_ppval.cpp b/devel/test_ker_ppval.cpp
index e8089121e..5c2131542 100644
--- a/devel/test_ker_ppval.cpp
+++ b/devel/test_ker_ppval.cpp
@@ -4,7 +4,8 @@
 For dyn linked:
 g++-9 test_ker_ppval.cpp -o test_ker_ppval -O3 -funroll-loops -march=native -fopenmp
 For statically linked so can control glibc (avoid Matlab calling being different):
-g++-9 test_ker_ppval.cpp -o test_ker_ppval -O3 -funroll-loops -march=native -fopenmp -static -lmvec
+g++-9 test_ker_ppval.cpp -o test_ker_ppval -O3 -funroll-loops -march=native -fopenmp \
+    -static -lmvec
 
 For GCC vectorization info: -fopt-info
 
@@ -29,46 +30,46 @@ I have even seen 1e-7 error for w=12 (which should be good to 1e-11)
 
 Demo that sscanf for w can speed plain eval but slow some magic horner speed:
 
 alex@fiona /home/alex/numerics/finufft/devel> g++-7 test_ker_ppval.cpp -o test_ker_ppval -Ofast -march=native -funroll-loops -fopenmp
 WITHOUT SSCANF FOR w:
 alex@fiona /home/alex/numerics/finufft/devel> ./test_ker_ppval 10000000
 acc test: sup err scaled to kernel peak of 1: 6.53e-11
 exp(sqrt): M=10000000 w=12 in 1.03 s:	116 Meval/s (ans=2.73717868002952e+19)
 Horner:    M=10000000 w=12 in 0.0812 s:	1.48e+03 Meval/s (ans=2.73717867964406e+19)
 rel err in sum = 1.41e-10
 WITH SSCANF FOR w:
 alex@fiona /home/alex/numerics/finufft/devel> ./test_ker_ppval 10000000
 acc test: sup err scaled to kernel peak of 1: 6.53e-11
 exp(sqrt): M=10000000 w=12 in 0.45 s:	267 Meval/s (ans=2.73717867952762e+19)
 Horner:    M=10000000 w=12 in 0.483 s:	248 Meval/s (ans=2.73717867952754e+19)
 rel err in sum = 3.01e-14
 
 */
 
-#include <vector>
-#include <stdlib.h>
-#include <stdio.h>
 #include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
 #include <time.h>
+#include <vector>
 
 // Choose prec... (w=7 enough for single)
 typedef double FLT;
-//typedef float FLT;
+// typedef float FLT;
 
-static inline void evaluate_kernel_vector(FLT *ker, const FLT *args, const FLT beta, const FLT c, const int w)
+static inline void evaluate_kernel_vector(FLT *ker, const FLT *args, const FLT beta,
+                                          const FLT c, const int w)
 /* Evaluate kernel for a vector of w arguments, must also be the int width par.
    The #pragra's need to be removed for icpc if -fopenmp not used; g++ is ok.
  */
 {
 #pragma omp simd
-  for (int i = 0; i < w; i++)
-    ker[i] = exp(beta * sqrt((FLT)1.0 - c*args[i]*args[i]));
-  // gcc 5.4 can't simd the combined loop, hence we split the
-  // out-of-support test to subsequent loop...
-  // This check loop prevents getting 0.2s (600 Meval/s):
+  for (int i = 0; i < w; i++) ker[i] = exp(beta * sqrt((FLT)1.0 - c * args[i] * args[i]));
+    // gcc 5.4 can't simd the combined loop, hence we split the
+    // out-of-support test to subsequent loop...
+    // This check loop prevents getting 0.2s (600 Meval/s):
 #pragma omp simd
   for (int i = 0; i < w; i++)
-    if (fabs(args[i]) >= (FLT)w/2)    // note fabs not abs!
+    if (fabs(args[i]) >= (FLT)w / 2) // note fabs not abs!
       ker[i] = 0.0;
 }
 
@@ -78,79 +76,78 @@ static inline void kernel_vector_Horner(FLT *ker, FLT z, int w)
    See: gen_all_horner_C_code.m
 */
 {
-  //#include "../src/ker_horner_allw.c"
+  // #include "../src/ker_horner_allw.c"
 #include "../src/ker_horner_allw_loop.c"
 }
 
-int main(int argc, char* argv[])
-{
-  int M = (int) 1e7;          // # of reps (<2^31)
-  if (argc>1)
-    sscanf(argv[1],"%d",&M);  // weirdly allows exp simd 10x faster, even on gcc 5.4.0
-  int w=12;                   // spread width
-  if (argc>2)
-    sscanf(argv[2],"%d",&w);  // prevents the magic 0.2s, keeps at 0.4s
-  FLT beta=2.30*w;            // should match kernel params for acc test
-  if (w==2) beta = 2.20*w;
-  if (w==3) beta = 2.26*w;
-  if (w==4) beta = 2.38*w;
-  FLT c = 4.0/(FLT)(w*w);          // set up ker params for plain eval
-  FLT iw = 1.0/(FLT)w;        // scale factor
+int main(int argc, char *argv[]) {
+  int M = (int)1e7;            // # of reps (<2^31)
+  if (argc > 1)
+    sscanf(argv[1], "%d", &M); // weirdly allows exp simd 10x faster, even on gcc 5.4.0
+  int w = 12;                  // spread width
+  if (argc > 2) sscanf(argv[2], "%d", &w); // prevents the magic 0.2s, keeps at 0.4s
+  FLT beta = 2.30 * w;                     // should match kernel params for acc test
+  if (w == 2) beta = 2.20 * w;
+  if (w == 3) beta = 2.26 * w;
+  if (w == 4) beta = 2.38 * w;
+  FLT c  = 4.0 / (FLT)(w * w);               // set up ker params for plain eval
+  FLT iw = 1.0 / (FLT)w;                     // scale factor
   std::vector<FLT> x(w);
-  std::vector<FLT> f(16), f2(16); // length=MAX_NSPREAD
+  std::vector<FLT> f(16), f2(16);            // length=MAX_NSPREAD
 
-  int Macc = 100;        // test accuracy.......
+  int Macc   = 100;                          // test accuracy.......
   FLT superr = 0.0;
-  for (int i=0;i<Macc;++i) {       // loop over eval grid sets
-    FLT z = (2*i)/(FLT)(Macc-1)-1.0;  // local offset sweep through z in [-1,1]
-    //printf("z=%g:\n",z);   // useful for calling w/ eg Macc=3
-    kernel_vector_Horner(&f2[0],z,w);   // eval kernel to f2, given offset z
-    for (int j=0;j<w;++j)           // vector of args in [-w/2,w/2] ker supp
-      x[j] = (-(FLT)w+1.0+z)/2 + j;
-    evaluate_kernel_vector(&f[0],&x[0],beta,c,w);   // eval kernel into f
-    for (int j=0;j<w;++j) {
-      //printf("x=%.3g\tf=%.6g\tf2=%.6g\tf2-f=%.3g\n",x[j],f[j],f2[j],f2[j]-f[j]);
-      FLT err = abs(f[j]-f2[j]);
-      if (err>superr) superr = err;
+  for (int i = 0; i < Macc; ++i) {           // loop over eval grid sets
+    FLT z = (2 * i) / (FLT)(Macc - 1) - 1.0; // local offset sweep through z in [-1,1]
+    // printf("z=%g:\n",z);   // useful for calling w/ eg Macc=3
+    kernel_vector_Horner(&f2[0], z, w); // eval kernel to f2, given offset z
+    for (int j = 0; j < w; ++j)         // vector of args in [-w/2,w/2] ker supp
+      x[j] = (-(FLT)w + 1.0 + z) / 2 + j;
+    evaluate_kernel_vector(&f[0], &x[0], beta, c, w); // eval kernel into f
+    for (int j = 0; j < w; ++j) {
+      // printf("x=%.3g\tf=%.6g\tf2=%.6g\tf2-f=%.3g\n",x[j],f[j],f2[j],f2[j]-f[j]);
+      FLT err = abs(f[j] - f2[j]);
+      if (err > superr) superr = err;
     }
   }
   superr /= exp(beta);
-  printf("acc test: sup err scaled to kernel peak of 1: %.3g\n",superr);
-  
+  printf("acc test: sup err scaled to kernel peak of 1: %.3g\n", superr);
+
   // test speed...... plain eval
-  clock_t start=clock();
-  FLT ans = 0.0;                     // dummy answer
-  for (int i=0;i<M;++i) {            // loop over eval grid sets
-    FLT xi = -w/2.0 + i/(FLT)(M-1);  // offset in [-w/2, -w/2+1]
-    for (int j=0;j<w;++j)
-     x[j] = xi + (FLT)j;            // vector of args for [-w/2,w/2] ker supp
-    evaluate_kernel_vector(&f[0],&x[0],beta,c,w);   // eval kernel into f
-    for (int j=0;j<w;++j) {
+  clock_t start = clock();
+  FLT ans       = 0.0;                    // dummy answer
+  for (int i = 0; i < M; ++i) {           // loop over eval grid sets
+    FLT xi = -w / 2.0 + i / (FLT)(M - 1); // offset in [-w/2, -w/2+1]
+    for (int j = 0; j < w; ++j)
+      x[j] = xi + (FLT)j;                 // vector of args for [-w/2,w/2] ker supp
+    evaluate_kernel_vector(&f[0], &x[0], beta, c, w); // eval kernel into f
+    for (int j = 0; j < w; ++j) {
       // printf("x=%.16g\tf=%.16g\n",x[j],f[j]);
-      ans += f[j];                   // do something cheap to use all f outputs
+      ans += f[j]; // do something cheap to use all f outputs
     }
   }
-  double t=(double)(clock()-start)/CLOCKS_PER_SEC;
-  printf("exp(sqrt): M=%d w=%d in %.3g s:\t%.3g Meval/s (ans=%.15g)\n",M,w,t,M*w/(t*1.0e6),ans);
-  
+  double t = (double)(clock() - start) / CLOCKS_PER_SEC;
+  printf("exp(sqrt): M=%d w=%d in %.3g s:\t%.3g Meval/s (ans=%.15g)\n", M, w, t,
+         M * w / (t * 1.0e6), ans);
+
   // test speed...... Horner (on same set as above, so can check its sum)
-  start=clock();
-  FLT ans2 = 0.0;                    // dummy answer
-  for (int i=0;i<M;++i) {            // loop over eval grid sets
-    FLT z = (2*i)/(FLT)(M-1)-1.0;    // local offset sweep through z in [-1,1]
-    kernel_vector_Horner(&f[0],z,w); // eval kernel to f, given offset z
-    for (int j=0;j<w;++j)
-      ans2 += f[j];                  // do something cheap to use all f outputs
-    }
-  double t2=(double)(clock()-start)/CLOCKS_PER_SEC;
-  printf("Horner:    M=%d w=%d in %.3g s:\t%.3g Meval/s (ans=%.15g)\n",M,w,t2,M*w/(t2*1.0e6),ans2);
+  start    = clock();
+  FLT ans2 = 0.0;                             // dummy answer
+  for (int i = 0; i < M; ++i) {               // loop over eval grid sets
+    FLT z = (2 * i) / (FLT)(M - 1) - 1.0;     // local offset sweep through z in [-1,1]
+    kernel_vector_Horner(&f[0], z, w);        // eval kernel to f, given offset z
+    for (int j = 0; j < w; ++j) ans2 += f[j]; // do something cheap to use all f outputs
+  }
+  double t2 = (double)(clock() - start) / CLOCKS_PER_SEC;
+  printf("Horner:    M=%d w=%d in %.3g s:\t%.3g Meval/s (ans=%.15g)\n", M, w, t2,
+         M * w / (t2 * 1.0e6), ans2);
 
-  printf("rel err in sum = %.3g\n",fabs(ans-ans2)/fabs(ans));  // another acc
+  printf("rel err in sum = %.3g\n", fabs(ans - ans2) / fabs(ans)); // another acc
 
   // append timing data to tmp file...
-  FILE *p = fopen("/tmp/test_ker_ppval.dat","a");
-  fprintf(p,"%d %d %.3f %.3f %.3g\n",M,w,t,t2,superr);
+  FILE *p = fopen("/tmp/test_ker_ppval.dat", "a");
+  fprintf(p, "%d %d %.3f %.3f %.3g\n", M, w, t, t2, superr);
   fclose(p);
-  
+
   return 0;
 }
diff --git a/devel/time2d2interp.cpp b/devel/time2d2interp.cpp
index de1199a16..146c11808 100644
--- a/devel/time2d2interp.cpp
+++ b/devel/time2d2interp.cpp
@@ -1,5 +1,5 @@
 /* deterministic speed test for 2d2 interpolation, without wrapping effects.
-   
+
    g++ time2d2interp.cpp -o time2d2interp -Ofast; OMP_NUM_THREADS=1 ./time2d2interp
 
    If ns=10 statically defined in code:
@@ -8,7 +8,7 @@
    xeon gcc 7.3 -O3:                    0.88 s
    xeon gcc 7.3 -O2:                    1.2 s
    xeon gcc 7.3:                        8.6 s
-   
+
    if ns=10 read from argv:
    xeon gcc 7.3 -Ofast:                 1.0 s
    xeon gcc 7.3 -Ofast -march=native:   1.4 s
@@ -21,69 +21,69 @@
    Barnett 5/1/18
 */
 
-#include <vector>
-#include <stdlib.h>
-#include <stdio.h>
 #include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
 #include <time.h>
+#include <vector>
 
 // Choose prec for floating pt...
 typedef double FLT;
 
 #define MAXNS 16
 
-int main(int argc, char *argv[])
-{
-  int M = 10000000;        // NU pts
-  int n = 2000;           // U grid pts per dimension (needn't be huge)
-  if (argc>1)
-    sscanf(argv[1],"%d",&M);
-  if (argc>2)
-    sscanf(argv[2],"%d",&n);
-  int ns=10;           // kernel width
-  if (argc>3)
-    sscanf(argv[3],"%d",&ns);
-  FLT ker1[MAXNS],ker2[MAXNS];
-
-  std::vector<FLT> du(2*n*n);      // U "input" array, with...
-  for (int i=0;i<2*n*n;++i)        // something in it
+int main(int argc, char *argv[]) {
+  int M = 10000000; // NU pts
+  int n = 2000;     // U grid pts per dimension (needn't be huge)
+  if (argc > 1) sscanf(argv[1], "%d", &M);
+  if (argc > 2) sscanf(argv[2], "%d", &n);
+  int ns = 10; // kernel width
+  if (argc > 3) sscanf(argv[3], "%d", &ns);
+  FLT ker1[MAXNS], ker2[MAXNS];
+
+  std::vector<FLT> du(2 * n * n);     // U "input" array, with...
+  for (int i = 0; i < 2 * n * n; ++i) // something in it
     du[i] = (FLT)i;
-  
-  clock_t start=clock();
-  FLT tot[2] = {0.0,0.0};    // complex output total
-  int N1=n, N2=n;
-  int i1=n/4, i2=n/4+7;      // starting pt for bottom left coords of interp box
-
-  for (int i=0;i<M;++i) {   // loop over NU pts ..............
-    for (int j=0;j<ns;++j) {  // some fixed 1d ker evals, dep on NU pt
-      ker1[j] = 1.0 - 0.1*(j-4.7)*(j-4.6) + ((FLT)i)*1e-7;;
-      ker2[j] = 0.7 - 0.04*(j-3.7)*(j-3.2) + ((FLT)i)*(-0.6e-7);
+
+  clock_t start = clock();
+  FLT tot[2]    = {0.0, 0.0};      // complex output total
+  int N1 = n, N2 = n;
+  int i1 = n / 4, i2 = n / 4 + 7;  // starting pt for bottom left coords of interp box
+
+  for (int i = 0; i < M; ++i) {    // loop over NU pts ..............
+    for (int j = 0; j < ns; ++j) { // some fixed 1d ker evals, dep on NU pt
+      ker1[j] = 1.0 - 0.1 * (j - 4.7) * (j - 4.6) + ((FLT)i) * 1e-7;
+      ;
+      ker2[j] = 0.7 - 0.04 * (j - 3.7) * (j - 3.2) + ((FLT)i) * (-0.6e-7);
     }
-    FLT out[2] = {0.0,0.0};                // re,im for result for each NU pt
+    FLT out[2] = {0.0, 0.0}; // re,im for result for each NU pt
 
     // core loop of interp_square... (no wrapping)
-    for (int dy=0; dy<ns; dy++) {
-      int j = N1*(i2+dy) + i1;
-      for (int dx=0; dx<ns; dx++) {
-	FLT k = ker1[dx]*ker2[dy];
-	out[0] += du[2*j] * k;
-	out[1] += du[2*j+1] * k;
-	++j;
+    for (int dy = 0; dy < ns; dy++) {
+      int j = N1 * (i2 + dy) + i1;
+      for (int dx = 0; dx < ns; dx++) {
+        FLT k = ker1[dx] * ker2[dy];
+        out[0] += du[2 * j] * k;
+        out[1] += du[2 * j + 1] * k;
+        ++j;
       }
     }
-    //printf("i=%d i1=%d i2=%d out=(%g,%g)\n",i,i1,i2,out[0],out[1]);
-    
-    tot[0]+=out[0];  // do something w/ answers
-    tot[1]+=out[1];
-    i1 += 1;         // slowly(!) advance the box corner up and across the grid
+    // printf("i=%d i1=%d i2=%d out=(%g,%g)\n",i,i1,i2,out[0],out[1]);
+
+    tot[0] += out[0]; // do something w/ answers
+    tot[1] += out[1];
+    i1 += 1;          // slowly(!) advance the box corner up and across the grid
     // (since N,M same order, sweeps O(1) times across the U grid, as bin sort)
-    if (i1>3*n/4) {i1-=n/2; i2+=1;}    // keep spread box away from edges
-    //i2 += 57;                // move far in slow direc - causes pain
-    if (i2>3*n/4) i2-=n/2;
-    
-  }                        // .......................
-  double t=(double)(clock()-start)/CLOCKS_PER_SEC;
-  printf("M=%d from N=%d^2, ns=%d: tot[0]=%.15g \t%.3g s\n",M,n,ns,tot[0],t);
-  printf("%.3g spread pts/s\n",M*ns*ns/t);
+    if (i1 > 3 * n / 4) {
+      i1 -= n / 2;
+      i2 += 1;
+    } // keep spread box away from edges
+    // i2 += 57;                // move far in slow direc - causes pain
+    if (i2 > 3 * n / 4) i2 -= n / 2;
+
+  } // .......................
+  double t = (double)(clock() - start) / CLOCKS_PER_SEC;
+  printf("M=%d from N=%d^2, ns=%d: tot[0]=%.15g \t%.3g s\n", M, n, ns, tot[0], t);
+  printf("%.3g spread pts/s\n", M * ns * ns / t);
   return 0;
 }
diff --git a/examples/cuda/example2d1many.cpp b/examples/cuda/example2d1many.cpp
index e67f8c30d..a5d0ecd5d 100644
--- a/examples/cuda/example2d1many.cpp
+++ b/examples/cuda/example2d1many.cpp
@@ -21,97 +21,101 @@ int main(int argc, char *argv[])
  * example code for 2D Type 1 transformation.
  *
  * To compile the code:
- * nvcc example2d1many.cpp -o example2d1many -I/loc/to/cufinufft/include /loc/to/cufinufft/lib-static/libcufinufft.a
- * -lcudart -lcufft -lnvToolsExt
+ * nvcc example2d1many.cpp -o example2d1many -I/loc/to/cufinufft/include
+ * /loc/to/cufinufft/lib-static/libcufinufft.a -lcudart -lcufft -lnvToolsExt
  *
  * or
  * export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/loc/to/cufinufft/lib
- * nvcc example2d1many.cpp -o example2d1many -I/loc/to/cufinufft/include -L/loc/to/cufinufft/lib/ -lcufinufft
+ * nvcc example2d1many.cpp -o example2d1many -I/loc/to/cufinufft/include
+ * -L/loc/to/cufinufft/lib/ -lcufinufft
  *
  *
  */
 {
-    std::cout << std::scientific << std::setprecision(3);
-
-    int ier;
-    int N1 = 256;
-    int N2 = 256;
-    int M = 65536;
-    int ntransf = 2;
-    int iflag = 1;
-    float tol = 1e-6;
-
-    float *x, *y;
-    std::complex<float> *c, *fk;
-    cudaMallocHost(&x, M * sizeof(float));
-    cudaMallocHost(&y, M * sizeof(float));
-    cudaMallocHost(&c, M * ntransf * sizeof(std::complex<float>));
-    cudaMallocHost(&fk, N1 * N2 * ntransf * sizeof(std::complex<float>));
-
-    float *d_x, *d_y;
-    cuFloatComplex *d_c, *d_fk;
-    cudaMalloc(&d_x, M * sizeof(float));
-    cudaMalloc(&d_y, M * sizeof(float));
-    cudaMalloc(&d_c, M * ntransf * sizeof(cuFloatComplex));
-    cudaMalloc(&d_fk, N1 * N2 * ntransf * sizeof(cuFloatComplex));
-
-    std::default_random_engine eng(1);
-    std::uniform_real_distribution<float> distr(-1, 1);
-
-    for (int i = 0; i < M; i++) {
-        x[i] = M_PI * distr(eng);
-        y[i] = M_PI * distr(eng);
-    }
-
-    for (int i = 0; i < M * ntransf; i++) {
-        c[i].real(distr(eng));
-        c[i].imag(distr(eng));
-    }
-    cudaMemcpy(d_x, x, M * sizeof(float), cudaMemcpyHostToDevice);
-    cudaMemcpy(d_y, y, M * sizeof(float), cudaMemcpyHostToDevice);
-    cudaMemcpy(d_c, c, M * ntransf * sizeof(cuFloatComplex), cudaMemcpyHostToDevice);
-
-    cufinufftf_plan dplan;
-
-    int dim = 2;
-    int64_t nmodes[3];
-    int type = 1;
-
-    nmodes[0] = N1;
-    nmodes[1] = N2;
-    nmodes[2] = 1;
-
-    ier = cufinufftf_makeplan(type, dim, nmodes, iflag, ntransf, tol, &dplan, NULL);
-
-    ier = cufinufftf_setpts(dplan, M, d_x, d_y, NULL, 0, NULL, NULL, NULL);
-
-    ier = cufinufftf_execute(dplan, d_c, d_fk);
-
-    ier = cufinufftf_destroy(dplan);
-
-    cudaMemcpy(fk, d_fk, N1 * N2 * ntransf * sizeof(cuFloatComplex), cudaMemcpyDeviceToHost);
-
-    std::cout << std::endl << "Accuracy check:" << std::endl;
-    int N = N1 * N2;
-    for (int i = 0; i < ntransf; i += 1) {
-        int nt1 = (int)(0.37 * N1), nt2 = (int)(0.26 * N2); // choose some mode index to check
-        std::complex<float> Ft = std::complex<float>(0, 0), J = std::complex<float>(0, 1) * (float)iflag;
-        for (CUFINUFFT_BIGINT j = 0; j < M; ++j)
-            Ft += c[j + i * M] * exp(J * (nt1 * x[j] + nt2 * y[j])); // crude direct
-        int it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2);                 // index in complex F as 1d array
-        printf("[gpu %3d] one mode: abs err in F[%d,%d] is %.3g\n", i, nt1, nt2, abs(Ft - fk[it + i * N]));
-        printf("[gpu %3d] one mode: rel err in F[%d,%d] is %.3g\n", i, nt1, nt2,
-               abs(Ft - fk[it + i * N]) / infnorm(N, fk + i * N));
-    }
-
-    cudaFreeHost(x);
-    cudaFreeHost(y);
-    cudaFreeHost(c);
-    cudaFreeHost(fk);
-
-    cudaFree(d_x);
-    cudaFree(d_y);
-    cudaFree(d_c);
-    cudaFree(d_fk);
-    return 0;
+  std::cout << std::scientific << std::setprecision(3);
+
+  int ier;
+  int N1      = 256;
+  int N2      = 256;
+  int M       = 65536;
+  int ntransf = 2;
+  int iflag   = 1;
+  float tol   = 1e-6;
+
+  float *x, *y;
+  std::complex<float> *c, *fk;
+  cudaMallocHost(&x, M * sizeof(float));
+  cudaMallocHost(&y, M * sizeof(float));
+  cudaMallocHost(&c, M * ntransf * sizeof(std::complex<float>));
+  cudaMallocHost(&fk, N1 * N2 * ntransf * sizeof(std::complex<float>));
+
+  float *d_x, *d_y;
+  cuFloatComplex *d_c, *d_fk;
+  cudaMalloc(&d_x, M * sizeof(float));
+  cudaMalloc(&d_y, M * sizeof(float));
+  cudaMalloc(&d_c, M * ntransf * sizeof(cuFloatComplex));
+  cudaMalloc(&d_fk, N1 * N2 * ntransf * sizeof(cuFloatComplex));
+
+  std::default_random_engine eng(1);
+  std::uniform_real_distribution<float> distr(-1, 1);
+
+  for (int i = 0; i < M; i++) {
+    x[i] = M_PI * distr(eng);
+    y[i] = M_PI * distr(eng);
+  }
+
+  for (int i = 0; i < M * ntransf; i++) {
+    c[i].real(distr(eng));
+    c[i].imag(distr(eng));
+  }
+  cudaMemcpy(d_x, x, M * sizeof(float), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_y, y, M * sizeof(float), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_c, c, M * ntransf * sizeof(cuFloatComplex), cudaMemcpyHostToDevice);
+
+  cufinufftf_plan dplan;
+
+  int dim = 2;
+  int64_t nmodes[3];
+  int type = 1;
+
+  nmodes[0] = N1;
+  nmodes[1] = N2;
+  nmodes[2] = 1;
+
+  ier = cufinufftf_makeplan(type, dim, nmodes, iflag, ntransf, tol, &dplan, NULL);
+
+  ier = cufinufftf_setpts(dplan, M, d_x, d_y, NULL, 0, NULL, NULL, NULL);
+
+  ier = cufinufftf_execute(dplan, d_c, d_fk);
+
+  ier = cufinufftf_destroy(dplan);
+
+  cudaMemcpy(fk, d_fk, N1 * N2 * ntransf * sizeof(cuFloatComplex),
+             cudaMemcpyDeviceToHost);
+
+  std::cout << std::endl << "Accuracy check:" << std::endl;
+  int N = N1 * N2;
+  for (int i = 0; i < ntransf; i += 1) {
+    int nt1 = (int)(0.37 * N1), nt2 = (int)(0.26 * N2); // choose some mode index to check
+    std::complex<float> Ft = std::complex<float>(0, 0),
+                        J  = std::complex<float>(0, 1) * (float)iflag;
+    for (CUFINUFFT_BIGINT j = 0; j < M; ++j)
+      Ft += c[j + i * M] * exp(J * (nt1 * x[j] + nt2 * y[j])); // crude direct
+    int it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2); // index in complex F as 1d array
+    printf("[gpu %3d] one mode: abs err in F[%d,%d] is %.3g\n", i, nt1, nt2,
+           abs(Ft - fk[it + i * N]));
+    printf("[gpu %3d] one mode: rel err in F[%d,%d] is %.3g\n", i, nt1, nt2,
+           abs(Ft - fk[it + i * N]) / infnorm(N, fk + i * N));
+  }
+
+  cudaFreeHost(x);
+  cudaFreeHost(y);
+  cudaFreeHost(c);
+  cudaFreeHost(fk);
+
+  cudaFree(d_x);
+  cudaFree(d_y);
+  cudaFree(d_c);
+  cudaFree(d_fk);
+  return 0;
 }
diff --git a/examples/cuda/example2d2many.cpp b/examples/cuda/example2d2many.cpp
index f35b10205..a6b0c6d3e 100644
--- a/examples/cuda/example2d2many.cpp
+++ b/examples/cuda/example2d2many.cpp
@@ -21,106 +21,109 @@ int main(int argc, char *argv[])
  * example code for 2D Type 1 transformation.
  *
  * To compile the code:
- * nvcc example2d2many.cpp -o example2d2many loc/to/cufinufft/lib-static/libcufinufft.a -I/loc/to/cufinufft/include
- * -lcudart -lcufft -lnvToolsExt
+ * nvcc example2d2many.cpp -o example2d2many loc/to/cufinufft/lib-static/libcufinufft.a
+ * -I/loc/to/cufinufft/include -lcudart -lcufft -lnvToolsExt
  *
  * or
  * export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/loc/to/cufinufft/lib
- * nvcc example2d2many.cpp -L/loc/to/cufinufft/lib/ -I/loc/to/cufinufft/include -o example2d1 -lcufinufft
+ * nvcc example2d2many.cpp -L/loc/to/cufinufft/lib/ -I/loc/to/cufinufft/include -o
+ * example2d2many -lcufinufft
  *
  *
  */
 {
-    std::cout << std::scientific << std::setprecision(3);
-
-    int ier;
-    int N1 = 128;
-    int N2 = 128;
-    int M = 10;
-    int ntransf = 4;
-    int maxbatchsize = 4;
-    int iflag = 1;
-    double tol = 1e-6;
-
-    double *x, *y;
-    std::complex<double> *c, *fk;
-    cudaMallocHost(&x, M * sizeof(double));
-    cudaMallocHost(&y, M * sizeof(double));
-    cudaMallocHost(&c, M * ntransf * sizeof(std::complex<double>));
-    cudaMallocHost(&fk, N1 * N2 * ntransf * sizeof(std::complex<double>));
-
-    double *d_x, *d_y;
-    cuDoubleComplex *d_c, *d_fk;
-    cudaMalloc(&d_x, M * sizeof(double));
-    cudaMalloc(&d_y, M * sizeof(double));
-    cudaMalloc(&d_c, M * ntransf * sizeof(cuDoubleComplex));
-    cudaMalloc(&d_fk, N1 * N2 * ntransf * sizeof(cuDoubleComplex));
-
-    std::default_random_engine eng(1);
-    std::uniform_real_distribution<double> distr(-1, 1);
-
-    for (int i = 0; i < M; i++) {
-        x[i] = M_PI * distr(eng);
-        y[i] = M_PI * distr(eng);
-    }
-
-    for (int i = 0; i < N1 * N2 * ntransf; i++) {
-        fk[i].real(distr(eng));
-        fk[i].imag(distr(eng));
-    }
-    cudaMemcpy(d_x, x, M * sizeof(double), cudaMemcpyHostToDevice);
-    cudaMemcpy(d_y, y, M * sizeof(double), cudaMemcpyHostToDevice);
-    cudaMemcpy(d_fk, fk, N1 * N2 * ntransf * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
-
-    cufinufft_plan dplan;
-
-    int dim = 2;
-    int64_t nmodes[3];
-    int type = 2;
-
-    nmodes[0] = N1;
-    nmodes[1] = N2;
-    nmodes[2] = 1;
-
-    cufinufft_opts opts;
-    cufinufft_default_opts(&opts);
-    opts.gpu_maxbatchsize = maxbatchsize;
-
-    ier = cufinufft_makeplan(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts);
-
-    ier = cufinufft_setpts(dplan, M, d_x, d_y, NULL, 0, NULL, NULL, NULL);
-
-    ier = cufinufft_execute(dplan, d_c, d_fk);
-
-    ier = cufinufft_destroy(dplan);
-
-    cudaMemcpy(c, d_c, M * ntransf * sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost);
-
-    std::cout << std::endl << "Accuracy check:" << std::endl;
-    std::complex<double> *fkstart;
-    std::complex<double> *cstart;
-    for (int t = 0; t < ntransf; t++) {
-        fkstart = fk + t * N1 * N2;
-        cstart = c + t * M;
-        int jt = M / 2; // check arbitrary choice of one targ pt
-        std::complex<double> J(0, iflag * 1);
-        std::complex<double> ct(0, 0);
-        int m = 0;
-        for (int m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2) // loop in correct order over F
-            for (int m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1)
-                ct += fkstart[m++] * exp(J * (m1 * x[jt] + m2 * y[jt])); // crude direct
-
-        printf("[gpu %3d] one targ: rel err in c[%d] is %.3g\n", t, jt, abs(cstart[jt] - ct) / infnorm(M, c));
-    }
-
-    cudaFreeHost(x);
-    cudaFreeHost(y);
-    cudaFreeHost(c);
-    cudaFreeHost(fk);
-
-    cudaFree(d_x);
-    cudaFree(d_y);
-    cudaFree(d_c);
-    cudaFree(d_fk);
-    return 0;
+  std::cout << std::scientific << std::setprecision(3);
+
+  int ier;
+  int N1           = 128;
+  int N2           = 128;
+  int M            = 10;
+  int ntransf      = 4;
+  int maxbatchsize = 4;
+  int iflag        = 1;
+  double tol       = 1e-6;
+
+  double *x, *y;
+  std::complex<double> *c, *fk;
+  cudaMallocHost(&x, M * sizeof(double));
+  cudaMallocHost(&y, M * sizeof(double));
+  cudaMallocHost(&c, M * ntransf * sizeof(std::complex<double>));
+  cudaMallocHost(&fk, N1 * N2 * ntransf * sizeof(std::complex<double>));
+
+  double *d_x, *d_y;
+  cuDoubleComplex *d_c, *d_fk;
+  cudaMalloc(&d_x, M * sizeof(double));
+  cudaMalloc(&d_y, M * sizeof(double));
+  cudaMalloc(&d_c, M * ntransf * sizeof(cuDoubleComplex));
+  cudaMalloc(&d_fk, N1 * N2 * ntransf * sizeof(cuDoubleComplex));
+
+  std::default_random_engine eng(1);
+  std::uniform_real_distribution<double> distr(-1, 1);
+
+  for (int i = 0; i < M; i++) {
+    x[i] = M_PI * distr(eng);
+    y[i] = M_PI * distr(eng);
+  }
+
+  for (int i = 0; i < N1 * N2 * ntransf; i++) {
+    fk[i].real(distr(eng));
+    fk[i].imag(distr(eng));
+  }
+  cudaMemcpy(d_x, x, M * sizeof(double), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_y, y, M * sizeof(double), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_fk, fk, N1 * N2 * ntransf * sizeof(cuDoubleComplex),
+             cudaMemcpyHostToDevice);
+
+  cufinufft_plan dplan;
+
+  int dim = 2;
+  int64_t nmodes[3];
+  int type = 2;
+
+  nmodes[0] = N1;
+  nmodes[1] = N2;
+  nmodes[2] = 1;
+
+  cufinufft_opts opts;
+  cufinufft_default_opts(&opts);
+  opts.gpu_maxbatchsize = maxbatchsize;
+
+  ier = cufinufft_makeplan(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts);
+
+  ier = cufinufft_setpts(dplan, M, d_x, d_y, NULL, 0, NULL, NULL, NULL);
+
+  ier = cufinufft_execute(dplan, d_c, d_fk);
+
+  ier = cufinufft_destroy(dplan);
+
+  cudaMemcpy(c, d_c, M * ntransf * sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost);
+
+  std::cout << std::endl << "Accuracy check:" << std::endl;
+  std::complex<double> *fkstart;
+  std::complex<double> *cstart;
+  for (int t = 0; t < ntransf; t++) {
+    fkstart = fk + t * N1 * N2;
+    cstart  = c + t * M;
+    int jt  = M / 2; // check arbitrary choice of one targ pt
+    std::complex<double> J(0, iflag * 1);
+    std::complex<double> ct(0, 0);
+    int m = 0;
+    for (int m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2) // loop in correct order over F
+      for (int m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1)
+        ct += fkstart[m++] * exp(J * (m1 * x[jt] + m2 * y[jt])); // crude direct
+
+    printf("[gpu %3d] one targ: rel err in c[%d] is %.3g\n", t, jt,
+           abs(cstart[jt] - ct) / infnorm(M, c));
+  }
+
+  cudaFreeHost(x);
+  cudaFreeHost(y);
+  cudaFreeHost(c);
+  cudaFreeHost(fk);
+
+  cudaFree(d_x);
+  cudaFree(d_y);
+  cudaFree(d_c);
+  cudaFree(d_fk);
+  return 0;
 }
diff --git a/examples/cuda/getting_started.cpp b/examples/cuda/getting_started.cpp
index 113a73e7c..da2bf6f5f 100644
--- a/examples/cuda/getting_started.cpp
+++ b/examples/cuda/getting_started.cpp
@@ -26,90 +26,91 @@
 #include <stdlib.h>
 
 int main() {
-    // Problem size: number of nonuniform points (M) and grid size (N).
-    const int M = 100000, N = 10000;
+  // Problem size: number of nonuniform points (M) and grid size (N).
+  const int M = 100000, N = 10000;
 
-    // Size of the grid as an array.
-    int64_t modes[1] = {N};
+  // Size of the grid as an array.
+  int64_t modes[1] = {N};
 
-    // Host pointers: frequencies (x), coefficients (c), and output (f).
-    float *x;
-    float _Complex *c;
-    float _Complex *f;
+  // Host pointers: frequencies (x), coefficients (c), and output (f).
+  float *x;
+  float _Complex *c;
+  float _Complex *f;
 
-    // Device pointers.
-    float *d_x;
-    cuFloatComplex *d_c, *d_f;
+  // Device pointers.
+  float *d_x;
+  cuFloatComplex *d_c, *d_f;
 
-    // Store cufinufft plan.
-    cufinufftf_plan plan;
+  // Store cufinufft plan.
+  cufinufftf_plan plan;
 
-    // Manual calculation at a single point idx.
-    int idx;
-    float _Complex f0;
+  // Manual calculation at a single point idx.
+  int idx;
+  float _Complex f0;
 
-    // Allocate the host arrays.
-    x = (float *)malloc(M * sizeof(float));
-    c = (float _Complex *)malloc(M * sizeof(float _Complex));
-    f = (float _Complex *)malloc(N * sizeof(float _Complex));
+  // Allocate the host arrays.
+  x = (float *)malloc(M * sizeof(float));
+  c = (float _Complex *)malloc(M * sizeof(float _Complex));
+  f = (float _Complex *)malloc(N * sizeof(float _Complex));
 
-    // Fill with random numbers. Frequencies must be in the interval [-pi, pi)
-    // while strengths can be any value.
-    srand(0);
+  // Fill with random numbers. Frequencies must be in the interval [-pi, pi)
+  // while strengths can be any value.
+  srand(0);
 
-    for (int j = 0; j < M; ++j) {
-        x[j] = 2 * M_PI * (((float)rand()) / RAND_MAX - 1);
-        c[j] = (2 * ((float)rand()) / RAND_MAX - 1) + I * (2 * ((float)rand()) / RAND_MAX - 1);
-    }
+  for (int j = 0; j < M; ++j) {
+    x[j] = 2 * M_PI * (((float)rand()) / RAND_MAX - 1);
+    c[j] =
+        (2 * ((float)rand()) / RAND_MAX - 1) + I * (2 * ((float)rand()) / RAND_MAX - 1);
+  }
 
-    // Allocate the device arrays and copy the x and c arrays.
-    cudaMalloc(&d_x, M * sizeof(float));
-    cudaMalloc(&d_c, M * sizeof(float _Complex));
-    cudaMalloc(&d_f, N * sizeof(float _Complex));
+  // Allocate the device arrays and copy the x and c arrays.
+  cudaMalloc(&d_x, M * sizeof(float));
+  cudaMalloc(&d_c, M * sizeof(float _Complex));
+  cudaMalloc(&d_f, N * sizeof(float _Complex));
 
-    cudaMemcpy(d_x, x, M * sizeof(float), cudaMemcpyHostToDevice);
-    cudaMemcpy(d_c, c, M * sizeof(float _Complex), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_x, x, M * sizeof(float), cudaMemcpyHostToDevice);
+  cudaMemcpy(d_c, c, M * sizeof(float _Complex), cudaMemcpyHostToDevice);
 
-    // Make the cufinufft plan for a 1D type-1 transform with six digits of
-    // tolerance.
-    cufinufftf_makeplan(1, 1, modes, 1, 1, 1e-6, &plan, NULL);
+  // Make the cufinufft plan for a 1D type-1 transform with six digits of
+  // tolerance.
+  cufinufftf_makeplan(1, 1, modes, 1, 1, 1e-6, &plan, NULL);
 
-    // Set the frequencies of the nonuniform points.
-    cufinufftf_setpts(plan, M, d_x, NULL, NULL, 0, NULL, NULL, NULL);
+  // Set the frequencies of the nonuniform points.
+  cufinufftf_setpts(plan, M, d_x, NULL, NULL, 0, NULL, NULL, NULL);
 
-    // Actually execute the plan on the given coefficients and store the result
-    // in the d_f array.
-    cufinufftf_execute(plan, d_c, d_f);
+  // Actually execute the plan on the given coefficients and store the result
+  // in the d_f array.
+  cufinufftf_execute(plan, d_c, d_f);
 
-    // Copy the result back onto the host.
-    cudaMemcpy(f, d_f, N * sizeof(float _Complex), cudaMemcpyDeviceToHost);
+  // Copy the result back onto the host.
+  cudaMemcpy(f, d_f, N * sizeof(float _Complex), cudaMemcpyDeviceToHost);
 
-    // Destroy the plan and free the device arrays after we're done.
-    cufinufftf_destroy(plan);
+  // Destroy the plan and free the device arrays after we're done.
+  cufinufftf_destroy(plan);
 
-    cudaFree(d_x);
-    cudaFree(d_c);
-    cudaFree(d_f);
+  cudaFree(d_x);
+  cudaFree(d_c);
+  cudaFree(d_f);
 
-    // Pick an index to check the result of the calculation.
-    idx = 4 * N / 7;
+  // Pick an index to check the result of the calculation.
+  idx = 4 * N / 7;
 
-    printf("f[%d] = %lf + %lfi\n", idx, crealf(f[idx]), cimagf(f[idx]));
+  printf("f[%d] = %lf + %lfi\n", idx, crealf(f[idx]), cimagf(f[idx]));
 
-    // Calculate the result manually using the formula for the type-1
-    // transform.
-    f0 = 0;
+  // Calculate the result manually using the formula for the type-1
+  // transform.
+  f0 = 0;
 
-    for (int j = 0; j < M; ++j) {
-        f0 += c[j] * cexp(I * x[j] * (idx - N / 2));
-    }
+  for (int j = 0; j < M; ++j) {
+    f0 += c[j] * cexp(I * x[j] * (idx - N / 2));
+  }
 
-    printf("f0[%d] = %lf + %lfi\n", idx, crealf(f0), cimagf(f0));
+  printf("f0[%d] = %lf + %lfi\n", idx, crealf(f0), cimagf(f0));
 
-    // Finally free the host arrays.
-    free(x);
-    free(c);
-    free(f);
+  // Finally free the host arrays.
+  free(x);
+  free(c);
+  free(f);
 
-    return 0;
+  return 0;
 }
diff --git a/examples/guru1d1.cpp b/examples/guru1d1.cpp
index eb7189da0..35c626093 100644
--- a/examples/guru1d1.cpp
+++ b/examples/guru1d1.cpp
@@ -1,87 +1,90 @@
 // this is all you must include for the finufft lib...
-#include <finufft.h>
 #include <complex>
+#include <finufft.h>
 
 // specific to this example...
+#include <cassert>
 #include <math.h>
-#include <vector>
 #include <stdio.h>
 #include <stdlib.h>
-#include <cassert>
+#include <vector>
 
 // only good for small projects...
 using namespace std;
 // allows 1i to be the imaginary unit... (C++14 onwards)
 using namespace std::complex_literals;
 
-int main(int argc, char* argv[])
+int main(int argc, char *argv[])
 /* Example calling guru C++ interface to FINUFFT library, passing
    pointers to STL vectors of C++ double complex numbers, with a math check.
    Barnett 2/27/20
 
    Compile on linux with (or see ../makefile):
-   g++ -std=c++14 -fopenmp guru1d1.cpp -I../include ../lib-static/libfinufft.a -o guru1d1  -lfftw3 -lfftw3_omp -lm
+   g++ -std=c++14 -fopenmp guru1d1.cpp -I../include ../lib-static/libfinufft.a -o guru1d1
+   -lfftw3 -lfftw3_omp -lm
 
    Or if you have built a single-thread library, remove -fopenmp and -lfftw3_omp
 
    Usage: ./guru1d1
 */
 {
-  int M = 1e6;            // number of nonuniform points
-  int N = 1e6;            // number of modes
-  double tol = 1e-9;      // desired accuracy
+  int M      = 1e6;      // number of nonuniform points
+  int N      = 1e6;      // number of modes
+  double tol = 1e-9;     // desired accuracy
 
-  int type = 1, dim = 1;     // 1d1
-  int64_t Ns[3];              // guru describes mode array by vector [N1,N2..]
-  Ns[0] = N;
-  int ntransf = 1;           // we want to do a single transform at a time
-  finufft_plan plan;         // creates a plan struct
-  int changeopts = 0;        // do you want to try changing opts? 0 or 1
-  if (changeopts) {          // demo how to change options away from defaults..
+  int type = 1, dim = 1; // 1d1
+  int64_t Ns[3];         // guru describes mode array by vector [N1,N2..]
+  Ns[0]       = N;
+  int ntransf = 1;       // we want to do a single transform at a time
+  finufft_plan plan;     // creates a plan struct
+  int changeopts = 0;    // do you want to try changing opts? 0 or 1
+  if (changeopts) {      // demo how to change options away from defaults..
     finufft_opts opts;
     finufft_default_opts(&opts);
-    opts.debug = 1;          // example options change
+    opts.debug = 1; // example options change
     finufft_makeplan(type, dim, Ns, +1, ntransf, tol, &plan, &opts);
-  } else                     // or, NULL here means use default opts...
+  } else            // or, NULL here means use default opts...
     finufft_makeplan(type, dim, Ns, +1, ntransf, tol, &plan, NULL);
 
   // generate some random nonuniform points
   vector<double> x(M);
-  for (int j=0; j<M; ++j)
-    x[j] = M_PI*(2*((double)rand()/RAND_MAX)-1);  // uniform random in [-pi,pi)
+  for (int j = 0; j < M; ++j)
+    x[j] = M_PI * (2 * ((double)rand() / RAND_MAX) - 1); // uniform random in [-pi,pi)
   // note FINUFFT doesn't use std::vector types, so we need to make a pointer...
   finufft_setpts(plan, M, x.data(), NULL, NULL, 0, NULL, NULL, NULL);
-  
+
   // generate some complex strengths
   vector<complex<double>> c(M);
-  for (int j=0; j<M; ++j)
-    c[j] = 2*((double)rand()/RAND_MAX)-1 + 1i*(2*((double)rand()/RAND_MAX)-1);
+  for (int j = 0; j < M; ++j)
+    c[j] =
+        2 * ((double)rand() / RAND_MAX) - 1 + 1i * (2 * ((double)rand() / RAND_MAX) - 1);
 
   // alloc output array for the Fourier modes, then do the transform
   vector<complex<double>> F(N);
   int ier = finufft_execute(plan, c.data(), F.data());
 
   // for fun, do another with same NU pts (no re-sorting), but new strengths...
-  for (int j=0; j<M; ++j)
-    c[j] = 2*((double)rand()/RAND_MAX)-1 + 1i*(2*((double)rand()/RAND_MAX)-1);
+  for (int j = 0; j < M; ++j)
+    c[j] =
+        2 * ((double)rand() / RAND_MAX) - 1 + 1i * (2 * ((double)rand() / RAND_MAX) - 1);
   ier = finufft_execute(plan, c.data(), F.data());
 
-  finufft_destroy(plan);    // don't forget! done with transforms of this size
+  finufft_destroy(plan); // don't forget! done with transforms of this size
 
   // rest is math checking and reporting...
-  int n = 142519;   // check the answer just for this mode
-  assert(n>=-(double)N/2 && n<(double)N/2);     // ensure meaningful test
-  complex<double> Ftest = complex<double>(0,0);
-  for (int j=0; j<M; ++j)
-    Ftest += c[j] * exp(1i*(double)n*x[j]);
-  int nout = n+N/2;        // index in output array for freq mode n
+  int n = 142519;                                   // check the answer just for this mode
+  assert(n >= -(double)N / 2 && n < (double)N / 2); // ensure meaningful test
+  complex<double> Ftest = complex<double>(0, 0);
+  for (int j = 0; j < M; ++j) Ftest += c[j] * exp(1i * (double)n * x[j]);
+  int nout    = n + N / 2; // index in output array for freq mode n
   double Fmax = 0.0;       // compute inf norm of F
-  for (int m=0; m<N; ++m) {
+  for (int m = 0; m < N; ++m) {
     double aF = abs(F[m]);
-    if (aF>Fmax) Fmax=aF;
+    if (aF > Fmax) Fmax = aF;
   }
-  double err = abs(F[nout] - Ftest)/Fmax;
-  printf("guru 1D type-1 double-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n",ier,n,err);
+  double err = abs(F[nout] - Ftest) / Fmax;
+  printf("guru 1D type-1 double-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n", ier,
+         n, err);
 
   return ier;
 }
diff --git a/examples/guru1d1f.cpp b/examples/guru1d1f.cpp
index a46c4a735..d890d3081 100644
--- a/examples/guru1d1f.cpp
+++ b/examples/guru1d1f.cpp
@@ -1,85 +1,88 @@
 // this is all you must include for the finufft lib...
-#include <finufft.h>
 #include <complex>
+#include <finufft.h>
 
 // specific to this example...
 #include <math.h>
-#include <vector>
 #include <stdio.h>
 #include <stdlib.h>
+#include <vector>
 
 // only good for small projects...
 using namespace std;
 // allows 1i to be the imaginary unit... (C++14 onwards)
 using namespace std::complex_literals;
 
-int main(int argc, char* argv[])
+int main(int argc, char *argv[])
 /* Example calling guru C++ interface to FINUFFT library, single-prec, passing
    pointers to STL vectors of C++ float complex numbers, with a math check.
    Barnett 7/5/20
 
    Compile on linux with:
-   g++-7 -std=c++14 -fopenmp guru1d1f.cpp -I../include ../lib-static/libfinufft.a -o guru1d1f  -lfftw3f -lfftw3f_omp -lm
+   g++-7 -std=c++14 -fopenmp guru1d1f.cpp -I../include ../lib-static/libfinufft.a \
+       -o guru1d1f -lfftw3f -lfftw3f_omp -lm
 
    Or if you have built a single-core library, remove -fopenmp and -lfftw3f_omp
 
    Usage: ./guru1d1f
 */
 {
-  int M = 1e5;            // number of nonuniform points
-  int N = 1e5;            // number of modes
-  float tol = 1e-5;       // desired accuracy
+  int M     = 1e5;                // number of nonuniform points
+  int N     = 1e5;                // number of modes
+  float tol = 1e-5;               // desired accuracy
 
-  int type = 1, dim = 1;     // 1d1
-  int64_t Ns[3];              // guru describes mode array by vector [N1,N2..]
-  Ns[0] = N;
-  int ntransf = 1;           // we want to do a single transform at a time
-  finufftf_plan plan;        // creates single-prec plan struct: note the "f"
-  int changeopts = 1;        // do you want to try changing opts? 0 or 1
-  if (changeopts) {          // demo how to change options away from defaults..
+  int type = 1, dim = 1;          // 1d1
+  int64_t Ns[3];                  // guru describes mode array by vector [N1,N2..]
+  Ns[0]       = N;
+  int ntransf = 1;                // we want to do a single transform at a time
+  finufftf_plan plan;             // creates single-prec plan struct: note the "f"
+  int changeopts = 1;             // do you want to try changing opts? 0 or 1
+  if (changeopts) {               // demo how to change options away from defaults..
     finufft_opts opts;
-    finufftf_default_opts(&opts);   // note "f" for single-prec, throughout...
-    opts.debug = 2;          // example options change
+    finufftf_default_opts(&opts); // note "f" for single-prec, throughout...
+    opts.debug = 2;               // example options change
     finufftf_makeplan(type, dim, Ns, +1, ntransf, tol, &plan, &opts);
-  } else                     // or, NULL here means use default opts...
+  } else                          // or, NULL here means use default opts...
     finufftf_makeplan(type, dim, Ns, +1, ntransf, tol, &plan, NULL);
 
   // generate some random nonuniform points
   vector<float> x(M);
-  for (int j=0; j<M; ++j)
-    x[j] = M_PI*(2*((float)rand()/RAND_MAX)-1);  // uniform random in [-pi,pi)
+  for (int j = 0; j < M; ++j)
+    x[j] = M_PI * (2 * ((float)rand() / RAND_MAX) - 1); // uniform random in [-pi,pi)
   // note FINUFFT doesn't use std::vector types, so we need to make a pointer...
   finufftf_setpts(plan, M, &x[0], NULL, NULL, 0, NULL, NULL, NULL);
-  
+
   // generate some complex strengths
   vector<complex<float>> c(M);
-  for (int j=0; j<M; ++j)
-    c[j] = 2*((float)rand()/RAND_MAX)-1 + 1if*(2*((float)rand()/RAND_MAX)-1);
+  for (int j = 0; j < M; ++j)
+    c[j] =
+        2 * ((float)rand() / RAND_MAX) - 1 + 1if * (2 * ((float)rand() / RAND_MAX) - 1);
 
   // alloc output array for the Fourier modes, then do the transform
   vector<complex<float>> F(N);
   int ier = finufftf_execute(plan, &c[0], &F[0]);
 
   // for fun, do another with same NU pts (no re-sorting), but new strengths...
-  for (int j=0; j<M; ++j)
-    c[j] = 2*((float)rand()/RAND_MAX)-1 + 1if*(2*((float)rand()/RAND_MAX)-1);
+  for (int j = 0; j < M; ++j)
+    c[j] =
+        2 * ((float)rand() / RAND_MAX) - 1 + 1if * (2 * ((float)rand() / RAND_MAX) - 1);
   ier = finufftf_execute(plan, &c[0], &F[0]);
 
-  finufftf_destroy(plan);    // done with transforms of this size
+  finufftf_destroy(plan); // done with transforms of this size
 
   // rest is math checking and reporting...
-  int n = 12519;   // check the answer just for this mode, must be in [-N/2,N/2)
-  complex<float> Ftest = complex<float>(0,0);
-  for (int j=0; j<M; ++j)
-    Ftest += c[j] * exp(1if*(float)n*x[j]);
-  int nout = n+N/2;        // index in output array for freq mode n
-  float Fmax = 0.0;        // compute inf norm of F
-  for (int m=0; m<N; ++m) {
+  int n = 12519; // check the answer just for this mode, must be in [-N/2,N/2)
+  complex<float> Ftest = complex<float>(0, 0);
+  for (int j = 0; j < M; ++j) Ftest += c[j] * exp(1if * (float)n * x[j]);
+  int nout   = n + N / 2; // index in output array for freq mode n
+  float Fmax = 0.0;       // compute inf norm of F
+  for (int m = 0; m < N; ++m) {
     float aF = abs(F[m]);
-    if (aF>Fmax) Fmax=aF;
+    if (aF > Fmax) Fmax = aF;
   }
-  float err = abs(F[nout] - Ftest)/Fmax;
-  printf("guru 1D type-1 single-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n",ier,n,err);
+  float err = abs(F[nout] - Ftest) / Fmax;
+  printf("guru 1D type-1 single-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n", ier,
+         n, err);
 
   return ier;
 }
diff --git a/examples/guru2d1.cpp b/examples/guru2d1.cpp
index 06d25e064..cfc39109e 100644
--- a/examples/guru2d1.cpp
+++ b/examples/guru2d1.cpp
@@ -1,51 +1,53 @@
 #include <finufft.h>
 
 #include <complex>
-#include <iostream>
 #include <iomanip>
+#include <iostream>
 #include <vector>
 using namespace std;
 
-int main(int argc, char *argv[]){
+int main(int argc, char *argv[]) {
+
+  /* 2D type 1 guru interface example of calling the FINUFFT library from C++,
+     using STL double complex vectors, with a math test. Similar to simple2d1
+     except illustrates the guru interface.
 
-/* 2D type 1 guru interface example of calling the FINUFFT library from C++,
-   using STL double complex vectors, with a math test. Similar to simple2d1
-   except illustrates the guru interface.
+     Compile multithreaded with
+     g++ -fopenmp guru2d1.cpp -I ../src ../lib-static/libfinufft.a -o guru2d1 -lfftw3 -lfftw3_omp -lm
+     single core with:
+     g++ guru2d1.cpp -I ../src ../lib-static/libfinufft.a -o guru2d1 -lfftw3 -lm
 
-   Compile multithreaded with
-   g++ -fopenmp guru2d1.cpp -I ../src ../lib-static/libfinufft.a -o guru2d1 -lfftw3 -lfftw3_omp -lm
-   single core with:
-   g++ guru2d1.cpp -I ../src ../lib-static/libfinufft.a -o guru2d1 -lfftw3 -lm
-   
-   Usage:  ./guru2d1
-*/
-  int M = 1e6;                 // number of nonuniform points
-  int N = 1e6;                 // approximate total number of modes (N1*N2)
-  double tol = 1e-6;           // desired accuracy
-  finufft_opts opts; finufft_default_opts(&opts);
+     Usage:  ./guru2d1
+  */
+  int M      = 1e6;  // number of nonuniform points
+  int N      = 1e6;  // approximate total number of modes (N1*N2)
+  double tol = 1e-6; // desired accuracy
+  finufft_opts opts;
+  finufft_default_opts(&opts);
   opts.upsampfac = 1.25;
   complex<double> I(0.0, 1.0); // the imaginary unit
 
   // generate random non-uniform points on (x,y) and complex strengths (c):
   vector<double> x(M), y(M);
-  vector<complex<double> > c(M);
+  vector<complex<double>> c(M);
 
-  for(int i = 0; i < M; i++){
-    x[i] = M_PI*(2*(double)rand()/RAND_MAX-1); //uniform random in [-pi, pi)
-    y[i] = M_PI*(2*(double)rand()/RAND_MAX-1); //uniform random in [-pi, pi)
+  for (int i = 0; i < M; i++) {
+    x[i] = M_PI * (2 * (double)rand() / RAND_MAX - 1); // uniform random in [-pi, pi)
+    y[i] = M_PI * (2 * (double)rand() / RAND_MAX - 1); // uniform random in [-pi, pi)
     // each component uniform random in [-1,1]
-    c[i] = 2*((double)rand()/RAND_MAX-1) + I*(2*((double)rand()/RAND_MAX)-1); 
+    c[i] =
+        2 * ((double)rand() / RAND_MAX - 1) + I * (2 * ((double)rand() / RAND_MAX) - 1);
   }
 
   // choose numbers of output Fourier coefficients in each dimension
-  int N1 = round(2.0*sqrt(N));
-  int N2 = round(N/N1);
-  
+  int N1 = round(2.0 * sqrt(N));
+  int N2 = round(N / N1);
+
   // output array for the Fourier modes
-  vector<complex<double> > F(N1*N2);
+  vector<complex<double>> F(N1 * N2);
 
-  int type=1, dim=2, ntrans=1;               // you could also do ntrans>1
-  int64_t Ns[] = {N1,N2};                    // N1,N2 as 64-bit int array
+  int type = 1, dim = 2, ntrans = 1; // you could also do ntrans>1
+  int64_t Ns[] = {N1, N2};           // N1,N2 as 64-bit int array
   // step 1: make a plan...
   finufft_plan plan;
   int ier = finufft_makeplan(type, dim, Ns, +1, ntrans, tol, &plan, NULL);
@@ -58,27 +60,28 @@ int main(int argc, char *argv[]){
   // step 4: free the memory used by the plan...
   finufft_destroy(plan);
 
-  int k1 = round(0.45*N1);    // check the answer for mode frequency (k1,k2)
-  int k2 = round(-0.35*N2);
-  
-  complex<double> Ftest(0,0);
-  for(int j = 0; j < M; j++)
-    Ftest += c[j]*exp(I*((double)k1*x[j]+(double)k2*y[j]));
+  int k1 = round(0.45 * N1); // check the answer for mode frequency (k1,k2)
+  int k2 = round(-0.35 * N2);
+
+  complex<double> Ftest(0, 0);
+  for (int j = 0; j < M; j++)
+    Ftest += c[j] * exp(I * ((double)k1 * x[j] + (double)k2 * y[j]));
 
-  // compute inf norm of F 
+  // compute inf norm of F
   double Fmax = 0.0;
-  for (int m=0; m<N1*N2; m++) {
+  for (int m = 0; m < N1 * N2; m++) {
     double aF = abs(F[m]);
-    if (aF>Fmax) Fmax=aF;
+    if (aF > Fmax) Fmax = aF;
   }
-  
+
   // indices in output array for this frequency pair (k1,k2)
-  int k1out = k1 + (int)N1/2; 
-  int k2out = k2 + (int)N2/2;
-  int indexOut = k1out + k2out*(N1);
+  int k1out    = k1 + (int)N1 / 2;
+  int k2out    = k2 + (int)N2 / 2;
+  int indexOut = k1out + k2out * (N1);
 
   // compute relative error
-  double err = abs(F[indexOut] - Ftest)/Fmax;
-  cout << "2D type-1 NUFFT done. ier=" << ier << ", err in F[" << indexOut << "] rel to max(F) is " << setprecision(2) << err << endl;
+  double err = abs(F[indexOut] - Ftest) / Fmax;
+  cout << "2D type-1 NUFFT done. ier=" << ier << ", err in F[" << indexOut
+       << "] rel to max(F) is " << setprecision(2) << err << endl;
   return ier;
 }
diff --git a/examples/gurumany1d1.cpp b/examples/gurumany1d1.cpp
index 8f150e609..01503af14 100644
--- a/examples/gurumany1d1.cpp
+++ b/examples/gurumany1d1.cpp
@@ -1,4 +1,4 @@
-/* Demonstrate guru FINUFFT interface performing a stack of 1d type 1 
+/* Demonstrate guru FINUFFT interface performing a stack of 1d type 1
    transforms in a single execute call. See guru1d1.cpp for other guru
    features demonstrated.
 
@@ -11,70 +11,73 @@
 */
 
 // this is all you must include for the finufft lib...
-#include <finufft.h>
 #include <complex>
+#include <finufft.h>
 
 // specific to this demo...
+#include <cassert>
 #include <math.h>
-#include <vector>
 #include <stdio.h>
 #include <stdlib.h>
-#include <cassert>
+#include <vector>
 
 // only good for small projects...
 using namespace std;
 // allows 1i to be the imaginary unit... (C++14 onwards)
 using namespace std::complex_literals;
 
-int main(int argc, char* argv[])
-{
-  int M = 2e5;            // number of nonuniform points
-  int N = 1e5;            // number of modes
-  double tol = 1e-9;      // desired accuracy
-  int ntrans = 100;       // request a bunch of transforms in the execute
-  int isign = +1;         // sign of i in the transform math definition
-  
+int main(int argc, char *argv[]) {
+  int M      = 2e5;          // number of nonuniform points
+  int N      = 1e5;          // number of modes
+  double tol = 1e-9;         // desired accuracy
+  int ntrans = 100;          // request a bunch of transforms in the execute
+  int isign  = +1;           // sign of i in the transform math definition
+
   int type = 1, dim = 1;     // 1d1
-  int64_t Ns[3] = {N,0,0};   // guru describes mode array by vector [N1,N2..]
+  int64_t Ns[3] = {N, 0, 0}; // guru describes mode array by vector [N1,N2..]
   finufft_plan plan;         // creates a plan struct (NULL below: default opts)
   finufft_makeplan(type, dim, Ns, isign, ntrans, tol, &plan, NULL);
 
   // generate random nonuniform points and pass to FINUFFT
   vector<double> x(M);
-  for (int j=0; j<M; ++j)
-    x[j] = M_PI*(2*((double)rand()/RAND_MAX)-1);  // uniform random in [-pi,pi)
+  for (int j = 0; j < M; ++j)
+    x[j] = M_PI * (2 * ((double)rand() / RAND_MAX) - 1); // uniform random in [-pi,pi)
   finufft_setpts(plan, M, x.data(), NULL, NULL, 0, NULL, NULL, NULL);
-  
+
   // generate ntrans complex strength vectors each of length M (the slow bit!)
-  vector<complex<double>> c(M*ntrans);        // plain contiguous storage
-  for (int j=0; j<M*ntrans; ++j)
-    c[j] = 2*((double)rand()/RAND_MAX)-1 + 1i*(2*((double)rand()/RAND_MAX)-1);
+  vector<complex<double>> c(M * ntrans); // plain contiguous storage
+  for (int j = 0; j < M * ntrans; ++j)
+    c[j] =
+        2 * ((double)rand() / RAND_MAX) - 1 + 1i * (2 * ((double)rand() / RAND_MAX) - 1);
 
   // alloc output array for the Fourier modes, then do the transform
-  vector<complex<double>> F(N*ntrans);
-  printf("guru many 1D type-1 double-prec, tol=%.3g, executing %d transforms (vectorized), each size %d NU pts to %d modes...\n",tol,ntrans,M,N);
+  vector<complex<double>> F(N * ntrans);
+  printf("guru many 1D type-1 double-prec, tol=%.3g, executing %d transforms "
+         "(vectorized), each size %d NU pts to %d modes...\n",
+         tol, ntrans, M, N);
   int ier = finufft_execute(plan, c.data(), F.data());
 
   // could now change c, do another execute, do another setpts, execute, etc...
-  
-  finufft_destroy(plan);  // don't forget! we're done with transforms of this size
-  
+
+  finufft_destroy(plan); // don't forget! we're done with transforms of this size
+
   // rest is math checking and reporting...
-  int k = 42519;   // check the answer just for this mode
-  int trans = 71;   // ...testing in just this transform
-  assert(k>=-(double)N/2 && k<(double)N/2);     // ensure meaningful test
-  assert(trans>=0 && trans<ntrans);
-  complex<double> Ftest = complex<double>(0,0);
-  for (int j=0; j<M; ++j)
-    Ftest += c[j+M*trans] * exp(1i*(double)k*x[j]);   // c offset to trans
-  double Fmax = 0.0;       // compute inf norm of F for selected transform
-  for (int m=0; m<N; ++m) {
-    double aF = abs(F[m+N*trans]);
-    if (aF>Fmax) Fmax=aF;
+  int k     = 42519;                                // check the answer just for this mode
+  int trans = 71;                                   // ...testing in just this transform
+  assert(k >= -(double)N / 2 && k < (double)N / 2); // ensure meaningful test
+  assert(trans >= 0 && trans < ntrans);
+  complex<double> Ftest = complex<double>(0, 0);
+  for (int j = 0; j < M; ++j)
+    Ftest += c[j + M * trans] * exp(1i * (double)k * x[j]); // c offset to trans
+  double Fmax = 0.0; // compute inf norm of F for selected transform
+  for (int m = 0; m < N; ++m) {
+    double aF = abs(F[m + N * trans]);
+    if (aF > Fmax) Fmax = aF;
   }
-  int nout = k+N/2 + N*trans;   // output index for freq mode k in the trans
-  double err = abs(F[nout] - Ftest)/Fmax;
-  printf("\tdone: ier=%d; for transform %d, rel err in F[%d] is %.3g\n",ier,trans,k,err);
+  int nout   = k + N / 2 + N * trans; // output index for freq mode k in the trans
+  double err = abs(F[nout] - Ftest) / Fmax;
+  printf("\tdone: ier=%d; for transform %d, rel err in F[%d] is %.3g\n", ier, trans, k,
+         err);
 
   return ier;
 }
diff --git a/examples/many1d1.cpp b/examples/many1d1.cpp
index 8176007c9..4b884d028 100644
--- a/examples/many1d1.cpp
+++ b/examples/many1d1.cpp
@@ -1,59 +1,61 @@
 #include <finufft.h>
 
-#include <vector>
+#include <cassert>
 #include <complex>
 #include <cstdio>
 #include <stdlib.h>
-#include <cassert>
+#include <vector>
 using namespace std;
 
-int main(int argc, char* argv[])
+int main(int argc, char *argv[])
 /* Example of calling the vectorized FINUFFT library from C++, using STL
    double complex vectors, with a math test.
 
    Compile with:
-   g++ -fopenmp many1d1.cpp -I../include ../lib-static/libfinufft.a -o many1d1 -lfftw3 -lfftw3_omp -lm
-   or if you have built a single-core version:
-   g++ many1d1.cpp -I../include ../lib-static/libfinufft.a -o many1d1 -lfftw3 -lm
+   g++ -fopenmp many1d1.cpp -I../include ../lib-static/libfinufft.a -o many1d1 -lfftw3 -lfftw3_omp -lm
+   or if you have built a single-core version:
+   g++ many1d1.cpp -I../include ../lib-static/libfinufft.a -o many1d1 -lfftw3 -lm
 
    Usage: ./many1d1
 */
 {
-  int ntrans = 3;         // how many stacked transforms to do
-  int M = 1e6;            // nonuniform points (same for all transforms)
-  int N = 1e6;            // number of modes (same for all transforms)
-  double tol = 1e-9;      // desired accuracy
-  finufft_opts* opts = new finufft_opts;     // opts is pointer to struct
+  int ntrans         = 3;                // how many stacked transforms to do
+  int M              = 1e6;              // nonuniform points (same for all transforms)
+  int N              = 1e6;              // number of modes (same for all transforms)
+  double tol         = 1e-9;             // desired accuracy
+  finufft_opts *opts = new finufft_opts; // opts is pointer to struct
   finufft_default_opts(opts);
-  complex<double> I = complex<double>(0.0,1.0);  // the imaginary unit
-  
+  complex<double> I = complex<double>(0.0, 1.0); // the imaginary unit
+
   // generate some random nonuniform points (x) and complex strengths (c)...
   vector<double> x(M);
-  vector<complex<double> > c(M*ntrans);
-  for (int j=0; j<M; ++j)
-    x[j] = M_PI*(2*((double)rand()/RAND_MAX)-1);  // uniform random in [-pi,pi)
-  for (int j=0; j<M*ntrans; ++j)                  // fill all ntrans vectors...
-    c[j] = 2*((double)rand()/RAND_MAX)-1 + I*(2*((double)rand()/RAND_MAX)-1);
+  vector<complex<double>> c(M * ntrans);
+  for (int j = 0; j < M; ++j)
+    x[j] = M_PI * (2 * ((double)rand() / RAND_MAX) - 1); // uniform random in [-pi,pi)
+  for (int j = 0; j < M * ntrans; ++j)                   // fill all ntrans vectors...
+    c[j] =
+        2 * ((double)rand() / RAND_MAX) - 1 + I * (2 * ((double)rand() / RAND_MAX) - 1);
   // allocate output array for the Fourier modes...
-  vector<complex<double> > F(N*ntrans);
+  vector<complex<double>> F(N * ntrans);
 
   // call the NUFFT (with iflag=+1): note pointers (not STL vecs) passed...
-  int ier = finufft1d1many(ntrans,M,&x[0],&c[0],+1,tol,N,&F[0],NULL);
-
-  int k = 142519;          // check the answer just for this mode...
-  int trans = ntrans-1;    // ...in this transform
-  assert(k>=-(double)N/2 && k<(double)N/2);
-  
-  complex<double> Ftest = complex<double>(0,0);  // do the naive calc...
-  for (int j=0; j<M; ++j)
-    Ftest += c[j+M*trans] * exp(I*(double)k*x[j]);   // c from transform # trans
-  double Fmax = 0.0;       // compute inf norm of F for transform # trans
-  for (int m=0; m<N; ++m) {
-    double aF = abs(F[m+N*trans]);
-    if (aF>Fmax) Fmax=aF;
+  int ier = finufft1d1many(ntrans, M, &x[0], &c[0], +1, tol, N, &F[0], NULL);
+
+  int k     = 142519;     // check the answer just for this mode...
+  int trans = ntrans - 1; // ...in this transform
+  assert(k >= -(double)N / 2 && k < (double)N / 2);
+
+  complex<double> Ftest = complex<double>(0, 0);           // do the naive calc...
+  for (int j = 0; j < M; ++j)
+    Ftest += c[j + M * trans] * exp(I * (double)k * x[j]); // c from transform # trans
+  double Fmax = 0.0; // compute inf norm of F for transform # trans
+  for (int m = 0; m < N; ++m) {
+    double aF = abs(F[m + N * trans]);
+    if (aF > Fmax) Fmax = aF;
   }
-  int kout = k+N/2+N*trans;    // output index, freq mode k, transform # trans
-  double err = abs(F[kout] - Ftest)/Fmax;
-  printf("1D type-1 double-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n",ier,k,err);
+  int kout   = k + N / 2 + N * trans; // output index, freq mode k, transform # trans
+  double err = abs(F[kout] - Ftest) / Fmax;
+  printf("1D type-1 double-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n", ier, k,
+         err);
   return ier;
 }
diff --git a/examples/simple1d1.cpp b/examples/simple1d1.cpp
index 1e7f16858..4e547eafc 100644
--- a/examples/simple1d1.cpp
+++ b/examples/simple1d1.cpp
@@ -2,60 +2,61 @@
 #include <finufft.h>
 
 // also used in this example...
-#include <vector>
+#include <cassert>
 #include <complex>
 #include <cstdio>
 #include <stdlib.h>
-#include <cassert>
+#include <vector>
 using namespace std;
 
-int main(int argc, char* argv[])
+int main(int argc, char *argv[])
 /* Example of calling the FINUFFT library from C++, using STL
    double complex vectors, with a math test.
    Double-precision version (see simple1d1f for single-precision)
 
    Compile with (static library case):
-   g++ simple1d1.cpp -I../include ../lib-static/libfinufft.a -o simple1d1 -lfftw3 -lfftw3_omp
-   or if you have built a single-core version:
    g++ simple1d1.cpp -I../include ../lib-static/libfinufft.a -o simple1d1 -lfftw3
+   -lfftw3_omp
+   or if you have built a single-core version: g++ simple1d1.cpp -I../include ../lib-static/libfinufft.a -o simple1d1 -lfftw3
 
    Usage: ./simple1d1
 
    Also see ../docs/cex.rst or online documentation.
 */
 {
-  int M = 1e6;            // number of nonuniform points
-  int N = 1e6;            // number of modes
-  double acc = 1e-9;      // desired accuracy
-  finufft_opts* opts = new finufft_opts;     // opts is pointer to struct
+  int M              = 1e6;                      // number of nonuniform points
+  int N              = 1e6;                      // number of modes
+  double acc         = 1e-9;                     // desired accuracy
+  finufft_opts *opts = new finufft_opts;         // opts is pointer to struct
   finufft_default_opts(opts);
-  complex<double> I = complex<double>(0.0,1.0);  // the imaginary unit
-  
+  complex<double> I = complex<double>(0.0, 1.0); // the imaginary unit
+
   // generate some random nonuniform points (x) and complex strengths (c)...
   vector<double> x(M);
-  vector<complex<double> > c(M);
-  for (int j=0; j<M; ++j) {
-    x[j] = M_PI*(2*((double)rand()/RAND_MAX)-1);  // uniform random in [-pi,pi)
-    c[j] = 2*((double)rand()/RAND_MAX)-1 + I*(2*((double)rand()/RAND_MAX)-1);
+  vector<complex<double>> c(M);
+  for (int j = 0; j < M; ++j) {
+    x[j] = M_PI * (2 * ((double)rand() / RAND_MAX) - 1); // uniform random in [-pi,pi)
+    c[j] =
+        2 * ((double)rand() / RAND_MAX) - 1 + I * (2 * ((double)rand() / RAND_MAX) - 1);
   }
   // allocate output array for the Fourier modes...
-  vector<complex<double> > F(N);
-  
+  vector<complex<double>> F(N);
+
   // call the NUFFT (with iflag=+1): note pointers (not STL vecs) passed...
-  int ier = finufft1d1(M,&x[0],&c[0],+1,acc,N,&F[0],opts);
-
-  int k = 142519;   // check the answer just for this mode frequency...
-  assert(k>=-(double)N/2 && k<(double)N/2);
-  complex<double> Ftest = complex<double>(0,0);
-  for (int j=0; j<M; ++j)
-    Ftest += c[j] * exp(I*(double)k*x[j]);
-  double Fmax = 0.0;       // compute inf norm of F
-  for (int m=0; m<N; ++m) {
+  int ier = finufft1d1(M, &x[0], &c[0], +1, acc, N, &F[0], opts);
+
+  int k = 142519; // check the answer just for this mode frequency...
+  assert(k >= -(double)N / 2 && k < (double)N / 2);
+  complex<double> Ftest = complex<double>(0, 0);
+  for (int j = 0; j < M; ++j) Ftest += c[j] * exp(I * (double)k * x[j]);
+  double Fmax = 0.0; // compute inf norm of F
+  for (int m = 0; m < N; ++m) {
     double aF = abs(F[m]);
-    if (aF>Fmax) Fmax=aF;
+    if (aF > Fmax) Fmax = aF;
   }
-  int kout = k+N/2;        // index in output array for freq mode k
-  double err = abs(F[kout] - Ftest)/Fmax;
-  printf("1D type-1 double-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n",ier,k,err);
+  int kout   = k + N / 2; // index in output array for freq mode k
+  double err = abs(F[kout] - Ftest) / Fmax;
+  printf("1D type-1 double-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n", ier, k,
+         err);
   return ier;
 }
diff --git a/examples/simple1d1f.cpp b/examples/simple1d1f.cpp
index fea98b8d6..3882d8ea1 100644
--- a/examples/simple1d1f.cpp
+++ b/examples/simple1d1f.cpp
@@ -2,58 +2,58 @@
 #include <finufft.h>
 
 // also needed for this example...
-#include <vector>
+#include <cassert>
 #include <complex>
 #include <stdio.h>
 #include <stdlib.h>
-#include <cassert>
+#include <vector>
 using namespace std;
 
-int main(int argc, char* argv[])
+int main(int argc, char *argv[])
 /* Example of calling the FINUFFT library from C++, using STL
    single complex vectors, with a math test.
    (See simple1d1 for double-precision version.)
 
    Compile with:
-   g++ -fopenmp simple1d1f.cpp -I../include ../lib-static/libfinufft.a -o simple1d1f -lfftw3f -lfftw3f_omp -lm
-   or if you have built a single-core version:
-   g++ simple1d1f.cpp -I../include ../lib-static/libfinufft.a -o simple1d1f -lfftw3f -lm
+   g++ -fopenmp simple1d1f.cpp -I../include ../lib-static/libfinufft.a -o simple1d1f -lfftw3f -lfftw3f_omp -lm
+   or if you have built a single-core version:
+   g++ simple1d1f.cpp -I../include ../lib-static/libfinufft.a -o simple1d1f -lfftw3f -lm
 
    Usage: ./simple1d1f
 */
 {
-  int M = 1e5;            // number of nonuniform points
-  int N = 1e5;            // number of modes (NB if too large lose acc in 1d)
-  float acc = 1e-3;       // desired accuracy
-  finufft_opts* opts = new finufft_opts;     // opts is pointer to struct
-  finufftf_default_opts(opts);   // note finufft "f" suffix
-  complex<float> I = complex<float>(0.0,1.0);  // the imaginary unit
-  
+  int M              = 1e5;  // number of nonuniform points
+  int N              = 1e5;  // number of modes (NB if too large lose acc in 1d)
+  float acc          = 1e-3; // desired accuracy
+  finufft_opts *opts = new finufft_opts;       // opts is pointer to struct
+  finufftf_default_opts(opts);                 // note finufft "f" suffix
+  complex<float> I = complex<float>(0.0, 1.0); // the imaginary unit
+
   // generate some random nonuniform points (x) and complex strengths (c)...
   vector<float> x(M);
-  vector<complex<float> > c(M);
-  for (int j=0; j<M; ++j) {
-    x[j] = M_PI*(2*((float)rand()/RAND_MAX)-1);  // uniform random in [-pi,pi)
-    c[j] = 2*((float)rand()/RAND_MAX)-1 + I*(2*((float)rand()/RAND_MAX)-1);
+  vector<complex<float>> c(M);
+  for (int j = 0; j < M; ++j) {
+    x[j] = M_PI * (2 * ((float)rand() / RAND_MAX) - 1); // uniform random in [-pi,pi)
+    c[j] = 2 * ((float)rand() / RAND_MAX) - 1 + I * (2 * ((float)rand() / RAND_MAX) - 1);
   }
   // allocate output array for the Fourier modes...
-  vector<complex<float> > F(N);
+  vector<complex<float>> F(N);
 
   // call the NUFFT (with iflag=+1): note pointers (not STL vecs) passed...
-  int ier = finufftf1d1(M,&x[0],&c[0],+1,acc,N,&F[0],opts);   // note "f"
+  int ier = finufftf1d1(M, &x[0], &c[0], +1, acc, N, &F[0], opts); // note "f"
 
-  int k = 14251;   // check the answer just for this mode...
-  assert(k>=-(double)N/2 && k<(double)N/2);
-  complex<float> Ftest = complex<float>(0,0);
-  for (int j=0; j<M; ++j)
-    Ftest += c[j] * exp(I*(float)k*x[j]);
-  float Fmax = 0.0;       // compute inf norm of F
-  for (int m=0; m<N; ++m) {
+  int k = 14251; // check the answer just for this mode...
+  assert(k >= -(double)N / 2 && k < (double)N / 2);
+  complex<float> Ftest = complex<float>(0, 0);
+  for (int j = 0; j < M; ++j) Ftest += c[j] * exp(I * (float)k * x[j]);
+  float Fmax = 0.0; // compute inf norm of F
+  for (int m = 0; m < N; ++m) {
     float aF = abs(F[m]);
-    if (aF>Fmax) Fmax=aF;
+    if (aF > Fmax) Fmax = aF;
   }
-  int kout = k+N/2;       // index in output array for freq mode k
-  float err = abs(F[kout] - Ftest)/Fmax;
-  printf("1D type-1 single-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n",ier,k,err);
+  int kout  = k + N / 2; // index in output array for freq mode k
+  float err = abs(F[kout] - Ftest) / Fmax;
+  printf("1D type-1 single-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n", ier, k,
+         err);
   return ier;
 }
diff --git a/examples/simple2d1.cpp b/examples/simple2d1.cpp
index cf912445b..91cce0bd1 100644
--- a/examples/simple2d1.cpp
+++ b/examples/simple2d1.cpp
@@ -1,76 +1,79 @@
 // this is all you must include for the finufft lib...
-#include <finufft.h>
 #include <complex>
+#include <finufft.h>
 
 // also needed for this example...
-#include <iostream>
 #include <iomanip>
+#include <iostream>
 #include <vector>
 using namespace std;
 
-int main(int argc, char *argv[]){
+int main(int argc, char *argv[]) {
 
-/* Simple 2D type-1 example of calling the FINUFFT library from C++, using plain
-   arrays of C++ complex numbers, with a math test. Double precision version. 
+  /* Simple 2D type-1 example of calling the FINUFFT library from C++, using plain
+     arrays of C++ complex numbers, with a math test. Double precision version.
 
-   Compile multithreaded with
-   g++ -fopenmp simple2d1.cpp -I ../src ../lib-static/libfinufft.a -o simple2d1 -lfftw3 -lfftw3_omp -lm
-   single core with:
-   g++ simple2d1.cpp -I ../src ../lib-static/libfinufft.a -o simple2d1 -lfftw3 -lm
-   
-   Usage:  ./simple2d1
-*/
+     Compile multithreaded with
+     g++ -fopenmp simple2d1.cpp -I ../src ../lib-static/libfinufft.a -o simple2d1 -lfftw3 -lfftw3_omp -lm
+     single core with:
+     g++ simple2d1.cpp -I ../src ../lib-static/libfinufft.a -o simple2d1 -lfftw3 -lm
 
-  int M = 1e6;                 // number of nonuniform points
-  int N = 1e6;                 // approximate total number of modes (N1*N2)
-  double tol = 1e-6;           // desired accuracy
-  finufft_opts opts; finufft_default_opts(&opts);
+     Usage:  ./simple2d1
+  */
+
+  int M      = 1e6;  // number of nonuniform points
+  int N      = 1e6;  // approximate total number of modes (N1*N2)
+  double tol = 1e-6; // desired accuracy
+  finufft_opts opts;
+  finufft_default_opts(&opts);
   complex<double> I(0.0, 1.0); // the imaginary unit
 
   // generate random non-uniform points on (x,y) and complex strengths (c):
   vector<double> x(M), y(M);
-  vector<complex<double> > c(M);
+  vector<complex<double>> c(M);
 
-  for(int i = 0; i < M; i++){
-    x[i] = M_PI*(2*(double)rand()/RAND_MAX-1); //uniform random in [-pi, pi)
-    y[i] = M_PI*(2*(double)rand()/RAND_MAX-1); //uniform random in [-pi, pi)
+  for (int i = 0; i < M; i++) {
+    x[i] = M_PI * (2 * (double)rand() / RAND_MAX - 1); // uniform random in [-pi, pi)
+    y[i] = M_PI * (2 * (double)rand() / RAND_MAX - 1); // uniform random in [-pi, pi)
 
     // each component uniform random in [-1,1]
-    c[i] = 2*((double)rand()/RAND_MAX-1) + I*(2*((double)rand()/RAND_MAX)-1); 
+    c[i] =
+        2 * ((double)rand() / RAND_MAX - 1) + I * (2 * ((double)rand() / RAND_MAX) - 1);
   }
 
   // choose numbers of output Fourier coefficients in each dimension
-  int N1 = round(2.0*sqrt(N));
-  int N2 = round(N/N1);
-  
+  int N1 = round(2.0 * sqrt(N));
+  int N2 = round(N / N1);
+
   // output array for the Fourier modes
-  vector<complex<double> > F(N1*N2);
+  vector<complex<double>> F(N1 * N2);
 
   // call the NUFFT (with iflag += 1): note passing in pointers...
   opts.upsampfac = 1.25;
-  int ier = finufft2d1(M,&x[0],&y[0], &c[0], 1, tol, N1, N2, &F[0], &opts);
+  int ier        = finufft2d1(M, &x[0], &y[0], &c[0], 1, tol, N1, N2, &F[0], &opts);
 
-  int k1 = round(0.45*N1);    // check the answer for mode frequency (k1,k2)
-  int k2 = round(-0.35*N2);
-  
-  complex<double> Ftest(0,0);
-  for(int j = 0; j < M; j++)
-    Ftest += c[j]*exp(I*((double)k1*x[j]+(double)k2*y[j]));
+  int k1 = round(0.45 * N1); // check the answer for mode frequency (k1,k2)
+  int k2 = round(-0.35 * N2);
 
-  // compute inf norm of F 
+  complex<double> Ftest(0, 0);
+  for (int j = 0; j < M; j++)
+    Ftest += c[j] * exp(I * ((double)k1 * x[j] + (double)k2 * y[j]));
+
+  // compute inf norm of F
   double Fmax = 0.0;
-  for (int m=0; m<N1*N2; m++) {
+  for (int m = 0; m < N1 * N2; m++) {
     double aF = abs(F[m]);
-    if (aF>Fmax) Fmax=aF;
+    if (aF > Fmax) Fmax = aF;
   }
-  
+
   // indices in output array for this frequency pair (k1,k2)
-  int k1out = k1 + N1/2; 
-  int k2out = k2 + N2/2;
-  int indexOut = k1out + k2out*(N1);
+  int k1out    = k1 + N1 / 2;
+  int k2out    = k2 + N2 / 2;
+  int indexOut = k1out + k2out * (N1);
 
   // compute relative error
-  double err = abs(F[indexOut] - Ftest)/Fmax;
-  cout << "2D type-1 NUFFT done. ier=" << ier << ", err in F[" << indexOut << "] rel to max(F) is " << setprecision(2) << err << endl;
+  double err = abs(F[indexOut] - Ftest) / Fmax;
+  cout << "2D type-1 NUFFT done. ier=" << ier << ", err in F[" << indexOut
+       << "] rel to max(F) is " << setprecision(2) << err << endl;
   return ier;
 }
diff --git a/examples/simulplans1d1.cpp b/examples/simulplans1d1.cpp
index b814876a2..4fb5f9449 100644
--- a/examples/simulplans1d1.cpp
+++ b/examples/simulplans1d1.cpp
@@ -2,37 +2,41 @@
 #include <finufft.h>
 
 // also used in this example...
-#include <vector>
+#include <cassert>
 #include <complex>
 #include <cstdio>
 #include <stdlib.h>
-#include <cassert>
+#include <vector>
 using namespace std;
 
-void strengths(vector<complex<double>>& c) {    // fill random complex array
-  for (long unsigned int j=0; j<c.size(); ++j)
-    c[j] = 2*((double)rand()/RAND_MAX)-1 + 1i*(2*((double)rand()/RAND_MAX)-1);
+void strengths(vector<complex<double>> &c) { // fill random complex array
+  for (long unsigned int j = 0; j < c.size(); ++j)
+    c[j] =
+        2 * ((double)rand() / RAND_MAX) - 1 + 1i * (2 * ((double)rand() / RAND_MAX) - 1);
 }
 
-double chk1d1(int n, vector<double>& x, vector<complex<double>>& c,
-              vector<complex<double>>& F)
+double chk1d1(int n, vector<double> &x, vector<complex<double>> &c,
+              vector<complex<double>> &F)
 // return error in output array F, for n'th mode only, rel to ||F||_inf
 {
   int N = F.size();
-  if (n>=N/2 || n<-N/2) { printf("n out of bounds!\n"); return NAN; }
-  complex<double> Ftest = complex<double>(0,0);
-  for (long unsigned int j=0; j<x.size(); ++j)
-    Ftest += c[j] * exp(1i*(double)n*x[j]);
-  int nout = n+N/2;        // index in output array for freq mode n
+  if (n >= N / 2 || n < -N / 2) {
+    printf("n out of bounds!\n");
+    return NAN;
+  }
+  complex<double> Ftest = complex<double>(0, 0);
+  for (long unsigned int j = 0; j < x.size(); ++j)
+    Ftest += c[j] * exp(1i * (double)n * x[j]);
+  int nout    = n + N / 2; // index in output array for freq mode n
   double Fmax = 0.0;       // compute inf norm of F
-  for (int m=0; m<N; ++m) {
+  for (int m = 0; m < N; ++m) {
     double aF = abs(F[m]);
-    if (aF>Fmax) Fmax=aF;
+    if (aF > Fmax) Fmax = aF;
   }
-  return abs(F[nout] - Ftest)/Fmax;
+  return abs(F[nout] - Ftest) / Fmax;
 }
 
-int main(int argc, char* argv[])
+int main(int argc, char *argv[])
 /* Demo two simultaneous FINUFFT plans (A,B) being handled in C++ without
    interacting (or at least without crashing; note that FFTW initialization
    is the only global state of FINUFFT library).
@@ -40,20 +44,21 @@ int main(int argc, char* argv[])
    Edited from guru1d1, Barnett 2/15/22
 
    Compile & run:
-   g++ -fopenmp simulplans1d1.cpp -I../include ../lib-static/libfinufft.a -o simulplans1d1 -lfftw3 -lfftw3_omp -lm && ./simulplans1d1
+   g++ -fopenmp simulplans1d1.cpp -I../include ../lib-static/libfinufft.a \
+   -o simulplans1d1 -lfftw3 -lfftw3_omp -lm && ./simulplans1d1
 */
 {
-  double tol = 1e-9;      // desired accuracy for both plans
+  double tol = 1e-9;         // desired accuracy for both plans
   int type = 1, dim = 1;     // 1d1
-  int64_t Ns[3];           // guru describes mode array by vector [N1,N2..]
+  int64_t Ns[3];             // guru describes mode array by vector [N1,N2..]
   int ntransf = 1;           // we want to do a single transform at a time
-  
-  int MA = 3e6;            // number of nonuniform points    PLAN A
-  int NA = 1e6;            // number of modes
-  int MB = 2e6;            // number of nonuniform points    PLAN B, diff sizes
-  int NB = 1e5;            // number of modes
 
-  finufft_plan planA, planB;         // creates plan structs
+  int MA = 3e6;              // number of nonuniform points    PLAN A
+  int NA = 1e6;              // number of modes
+  int MB = 2e6;              // number of nonuniform points    PLAN B, diff sizes
+  int NB = 1e5;              // number of modes
+
+  finufft_plan planA, planB; // creates plan structs
   Ns[0] = NA;
   finufft_makeplan(type, dim, Ns, +1, ntransf, tol, &planA, NULL);
   Ns[0] = NB;
@@ -61,22 +66,22 @@ int main(int argc, char* argv[])
 
   // generate some random nonuniform points
   vector<double> xA(MA), xB(MB);
-  for (int j=0; j<MA; ++j)
-    xA[j] = M_PI*(2*((double)rand()/RAND_MAX)-1);  // uniform random in [-pi,pi)
-  for (int j=0; j<MB; ++j)
-    xB[j] = M_PI*(2*((double)rand()/RAND_MAX)-1);  // uniform random in [-pi,pi)
-  
+  for (int j = 0; j < MA; ++j)
+    xA[j] = M_PI * (2 * ((double)rand() / RAND_MAX) - 1); // uniform random in [-pi,pi)
+  for (int j = 0; j < MB; ++j)
+    xB[j] = M_PI * (2 * ((double)rand() / RAND_MAX) - 1); // uniform random in [-pi,pi)
+
   // note FINUFFT doesn't use std::vector types, so we need to make a pointer...
   finufft_setpts(planA, MA, &xA[0], NULL, NULL, 0, NULL, NULL, NULL);
   finufft_setpts(planB, MB, &xB[0], NULL, NULL, 0, NULL, NULL, NULL);
-  
+
   // generate some complex strengths
   vector<complex<double>> cA(MA), cB(MB);
   strengths(cA);
   strengths(cB);
-  
+
   // allocate output arrays for the Fourier modes...
-  vector<complex<double> > FA(NA), FB(NB);
+  vector<complex<double>> FA(NA), FB(NB);
   int ierA = finufft_execute(planA, &cA[0], &FA[0]);
   int ierB = finufft_execute(planB, &cB[0], &FB[0]);
 
@@ -87,14 +92,16 @@ int main(int argc, char* argv[])
   ierB = finufft_execute(planB, &cB[0], &FB[0]);
   finufft_destroy(planA);
   finufft_destroy(planB);
-  
+
   // math checking and reporting...
-  int n = 116354;
-  double errA = chk1d1(n,xA,cA,FA);
-  printf("planA: 1D type-1 double-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n",ierA,n,errA);
-  n = 27152;
-  double errB = chk1d1(n,xB,cB,FB);
-  printf("planB: 1D type-1 double-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n",ierB,n,errB);
+  int n       = 116354;
+  double errA = chk1d1(n, xA, cA, FA);
+  printf("planA: 1D type-1 double-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n",
+         ierA, n, errA);
+  n           = 27152;
+  double errB = chk1d1(n, xB, cB, FB);
+  printf("planB: 1D type-1 double-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n",
+         ierB, n, errB);
 
   return ierA + ierB;
 }
diff --git a/examples/threadsafe1d1.cpp b/examples/threadsafe1d1.cpp
index f25f25b8b..da267fa6c 100644
--- a/examples/threadsafe1d1.cpp
+++ b/examples/threadsafe1d1.cpp
@@ -2,15 +2,15 @@
 #include <finufft.h>
 
 // also used in this example...
-#include <vector>
+#include <cassert>
 #include <complex>
 #include <cstdio>
-#include <stdlib.h>
-#include <cassert>
 #include <omp.h>
+#include <stdlib.h>
+#include <vector>
 using namespace std;
 
-int main(int argc, char* argv[])
+int main(int argc, char *argv[])
 /* Demo single-threaded FINUFFT calls from inside a OMP parallel block.
    Adapted from simple1d1.cpp: C++, STL double complex vectors, with math test.
    Barnett 4/19/21, eg for Goran Zauhar, issue #183. Also see: many1d1.cpp.
@@ -26,50 +26,51 @@ int main(int argc, char* argv[])
    reporting small error.
 */
 {
-  int M = 1e5;            // number of nonuniform points
-  int N = 1e5;            // number of modes
-  double acc = 1e-9;      // desired accuracy
-  finufft_opts* opts = new finufft_opts;     // opts is pointer to struct
+  int M              = 1e5;                      // number of nonuniform points
+  int N              = 1e5;                      // number of modes
+  double acc         = 1e-9;                     // desired accuracy
+  finufft_opts *opts = new finufft_opts;         // opts is pointer to struct
   finufft_default_opts(opts);
-  complex<double> I = complex<double>(0.0,1.0);  // the imaginary unit
-  
-  opts->nthreads=1;       // *crucial* so that each call single-thread (otherwise segfaults)
+  complex<double> I = complex<double>(0.0, 1.0); // the imaginary unit
+
+  opts->nthreads = 1; // *crucial* so that each call single-thread (otherwise segfaults)
 
   // Now have each thread do independent 1D type 1 on their own data:
 #pragma omp parallel
   {
-  // generate some random nonuniform points (x) and complex strengths (c)...
-  // Note that these are local to the thread (if you have the *same* sets of
-  // NU pts x for each thread, consider instead using one vectorized multithreaded
-  // transform, which would be faster).
-  vector<double> x(M);
-  vector<complex<double> > c(M);
-  for (int j=0; j<M; ++j) {
-    x[j] = M_PI*(2*((double)rand()/RAND_MAX)-1);  // uniform random in [-pi,pi)
-    c[j] = 2*((double)rand()/RAND_MAX)-1 + I*(2*((double)rand()/RAND_MAX)-1);
-  }
-    
-  // allocate output array for the Fourier modes... local to the thread
-  vector<complex<double> > F(N);
+    // generate some random nonuniform points (x) and complex strengths (c)...
+    // Note that these are local to the thread (if you have the *same* sets of
+    // NU pts x for each thread, consider instead using one vectorized multithreaded
+    // transform, which would be faster).
+    vector<double> x(M);
+    vector<complex<double>> c(M);
+    for (int j = 0; j < M; ++j) {
+      x[j] = M_PI * (2 * ((double)rand() / RAND_MAX) - 1); // uniform random in [-pi,pi)
+      c[j] =
+          2 * ((double)rand() / RAND_MAX) - 1 + I * (2 * ((double)rand() / RAND_MAX) - 1);
+    }
 
-  // call the NUFFT (with iflag=+1): note pointers (not STL vecs) passed...
-  int ier = finufft1d1(M,&x[0],&c[0],+1,acc,N,&F[0],opts);
+    // allocate output array for the Fourier modes... local to the thread
+    vector<complex<double>> F(N);
 
-  int k = 42519;       // check the answer just for this mode frequency...
-  assert(k>=-(double)N/2 && k<(double)N/2);
-  complex<double> Ftest = complex<double>(0,0);
-  for (int j=0; j<M; ++j)
-    Ftest += c[j] * exp(I*(double)k*x[j]);
-  double Fmax = 0.0;       // compute inf norm of F
-  for (int m=0; m<N; ++m) {
-    double aF = abs(F[m]);
-    if (aF>Fmax) Fmax=aF;
-  }
-  int kout = k+N/2;        // index in output array for freq mode k
-  double err = abs(F[kout] - Ftest)/Fmax;
-  
-  printf("[thread %2d] 1D t-1 dbl-prec NUFFT done. ier=%d, rel err in F[%d]: %.3g\n",omp_get_thread_num(),ier,k,err);
+    // call the NUFFT (with iflag=+1): note pointers (not STL vecs) passed...
+    int ier = finufft1d1(M, &x[0], &c[0], +1, acc, N, &F[0], opts);
+
+    int k = 42519; // check the answer just for this mode frequency...
+    assert(k >= -(double)N / 2 && k < (double)N / 2);
+    complex<double> Ftest = complex<double>(0, 0);
+    for (int j = 0; j < M; ++j) Ftest += c[j] * exp(I * (double)k * x[j]);
+    double Fmax = 0.0; // compute inf norm of F
+    for (int m = 0; m < N; ++m) {
+      double aF = abs(F[m]);
+      if (aF > Fmax) Fmax = aF;
+    }
+    int kout   = k + N / 2; // index in output array for freq mode k
+    double err = abs(F[kout] - Ftest) / Fmax;
+
+    printf("[thread %2d] 1D t-1 dbl-prec NUFFT done. ier=%d, rel err in F[%d]: %.3g\n",
+           omp_get_thread_num(), ier, k, err);
   }
-  
+
   return 0;
 }
diff --git a/examples/threadsafe2d2f.cpp b/examples/threadsafe2d2f.cpp
index 9844af54a..e2ad64bb1 100644
--- a/examples/threadsafe2d2f.cpp
+++ b/examples/threadsafe2d2f.cpp
@@ -8,7 +8,8 @@
 
    To compile (note uses threads rather than omp version of FFTW3):
 
-   g++ -fopenmp threadsafe2d2f.cpp -I../include ../lib/libfinufft.so -o threadsafe2d2f -g -Wall
+   g++ -fopenmp threadsafe2d2f.cpp -I../include ../lib/libfinufft.so -o threadsafe2d2f \
+   -g -Wall
 
    ./threadsafe2d2f                                   <-- use all threads
    OMP_NUM_THREADS=1 ./threadsafe2d2f                 <-- sequential, 1 thread
@@ -23,43 +24,43 @@
 #include <finufft.h>
 
 // also used in this example...
-#include <vector>
 #include <complex>
 #include <iostream>
 #include <omp.h>
+#include <vector>
 using namespace std;
 
-int test_finufft(finufft_opts* opts)
-  // self-contained small test that one single-prec FINUFFT2D2 has no error/crash
+int test_finufft(finufft_opts *opts)
+// self-contained small test that one single-prec FINUFFT2D2 has no error/crash
 {
-  size_t n_rows = 256, n_cols = 256;       // 2d image size
-  size_t n_read = 512, n_spokes = 128;     // some k-space point params
-  size_t M = n_read*n_spokes;              // how many k-space pts; MRI-specific
-  std::vector<float> x(M);                 // bunch of zero input data
+  size_t n_rows = 256, n_cols = 256;   // 2d image size
+  size_t n_read = 512, n_spokes = 128; // some k-space point params
+  size_t M = n_read * n_spokes;        // how many k-space pts; MRI-specific
+  std::vector<float> x(M);             // bunch of zero input data
   std::vector<float> y(M);
-  std::vector<std::complex<float>> img(n_rows * n_cols);    // coeffs
-  std::vector<std::complex<float>> ksp(M);     // output array (vals @ k-space pts)
+  std::vector<std::complex<float>> img(n_rows * n_cols); // coeffs
+  std::vector<std::complex<float>> ksp(M); // output array (vals @ k-space pts)
 
-  int ier = finufftf2d2(M, x.data(), y.data(), ksp.data(),
-                        -1, 1e-3, n_rows, n_cols, img.data(), opts);
+  int ier = finufftf2d2(M, x.data(), y.data(), ksp.data(), -1, 1e-3, n_rows, n_cols,
+                        img.data(), opts);
 
-  std::cout << "\ttest_finufft: exit code " << ier << ", thread " << omp_get_thread_num() << std::endl;
+  std::cout << "\ttest_finufft: exit code " << ier << ", thread " << omp_get_thread_num()
+            << std::endl;
   return ier;
 }
 
-int main(int argc, char* argv[])
-{
+int main(int argc, char *argv[]) {
   finufft_opts opts;
   finufftf_default_opts(&opts);
-  opts.nthreads = 1;     // *crucial* so each call single-thread; else segfaults
+  opts.nthreads = 1;      // *crucial* so each call single-thread; else segfaults
 
-  int n_slices = 50;     // number of transforms. parallelize over slices
-  int overallstatus=0;
+  int n_slices      = 50; // number of transforms. parallelize over slices
+  int overallstatus = 0;
 #pragma omp parallel for
   for (int i = 0; i < n_slices; i++) {
     int ier = test_finufft(&opts);
-    if (ier!=0) overallstatus=1;
+    if (ier != 0) overallstatus = 1;
   }
-  
+
   return overallstatus;
 }
diff --git a/fortran/finufftfort.cpp b/fortran/finufftfort.cpp
index 9f415d647..799a10041 100644
--- a/fortran/finufftfort.cpp
+++ b/fortran/finufftfort.cpp
@@ -26,205 +26,182 @@
 // local prec-switching macros for fortran names, ie
 // underscore-suffixed versions of those at end of defs.h
 #define FINUFFT_DEFAULT_OPTS_ FINUFFTIFY(_default_opts_)
-#define FINUFFT_MAKEPLAN_ FINUFFTIFY(_makeplan_)
-#define FINUFFT_SETPTS_ FINUFFTIFY(_setpts_)
-#define FINUFFT_EXECUTE_ FINUFFTIFY(_execute_)
-#define FINUFFT_DESTROY_ FINUFFTIFY(_destroy_)
-#define FINUFFT1D1_ FINUFFTIFY(1d1_)
-#define FINUFFT1D2_ FINUFFTIFY(1d2_)
-#define FINUFFT1D3_ FINUFFTIFY(1d3_)
-#define FINUFFT2D1_ FINUFFTIFY(2d1_)
-#define FINUFFT2D2_ FINUFFTIFY(2d2_)
-#define FINUFFT2D3_ FINUFFTIFY(2d3_)
-#define FINUFFT3D1_ FINUFFTIFY(3d1_)
-#define FINUFFT3D2_ FINUFFTIFY(3d2_)
-#define FINUFFT3D3_ FINUFFTIFY(3d3_)
-#define FINUFFT1D1MANY_ FINUFFTIFY(1d1many_)
-#define FINUFFT1D2MANY_ FINUFFTIFY(1d2many_)
-#define FINUFFT1D3MANY_ FINUFFTIFY(1d3many_)
-#define FINUFFT2D1MANY_ FINUFFTIFY(2d1many_)
-#define FINUFFT2D2MANY_ FINUFFTIFY(2d2many_)
-#define FINUFFT2D3MANY_ FINUFFTIFY(2d3many_)
-#define FINUFFT3D1MANY_ FINUFFTIFY(3d1many_)
-#define FINUFFT3D2MANY_ FINUFFTIFY(3d2many_)
-#define FINUFFT3D3MANY_ FINUFFTIFY(3d3many_)
+#define FINUFFT_MAKEPLAN_     FINUFFTIFY(_makeplan_)
+#define FINUFFT_SETPTS_       FINUFFTIFY(_setpts_)
+#define FINUFFT_EXECUTE_      FINUFFTIFY(_execute_)
+#define FINUFFT_DESTROY_      FINUFFTIFY(_destroy_)
+#define FINUFFT1D1_           FINUFFTIFY(1d1_)
+#define FINUFFT1D2_           FINUFFTIFY(1d2_)
+#define FINUFFT1D3_           FINUFFTIFY(1d3_)
+#define FINUFFT2D1_           FINUFFTIFY(2d1_)
+#define FINUFFT2D2_           FINUFFTIFY(2d2_)
+#define FINUFFT2D3_           FINUFFTIFY(2d3_)
+#define FINUFFT3D1_           FINUFFTIFY(3d1_)
+#define FINUFFT3D2_           FINUFFTIFY(3d2_)
+#define FINUFFT3D3_           FINUFFTIFY(3d3_)
+#define FINUFFT1D1MANY_       FINUFFTIFY(1d1many_)
+#define FINUFFT1D2MANY_       FINUFFTIFY(1d2many_)
+#define FINUFFT1D3MANY_       FINUFFTIFY(1d3many_)
+#define FINUFFT2D1MANY_       FINUFFTIFY(2d1many_)
+#define FINUFFT2D2MANY_       FINUFFTIFY(2d2many_)
+#define FINUFFT2D3MANY_       FINUFFTIFY(2d3many_)
+#define FINUFFT3D1MANY_       FINUFFTIFY(3d1many_)
+#define FINUFFT3D2MANY_       FINUFFTIFY(3d2many_)
+#define FINUFFT3D3MANY_       FINUFFTIFY(3d3many_)
 
 #ifdef __cplusplus
 extern "C" {
 #endif
-  
+
 // --------------------- guru interface from fortran ------------------------
-void FINUFFT_MAKEPLAN_(int *type, int *n_dims, BIGINT *n_modes, int *iflag, int *n_transf, FLT *tol, FINUFFT_PLAN *plan, finufft_opts *o, int *ier)
-{
+void FINUFFT_MAKEPLAN_(int *type, int *n_dims, BIGINT *n_modes, int *iflag, int *n_transf,
+                       FLT *tol, FINUFFT_PLAN *plan, finufft_opts *o, int *ier) {
   if (!plan)
-    fprintf(stderr,"%s fortran: plan must be allocated as at least the size of a C pointer (usually 8 bytes)!\n",__func__);
+    fprintf(stderr,
+            "%s fortran: plan must be allocated as at least the size of a C pointer "
+            "(usually 8 bytes)!\n",
+            __func__);
   else {
     // pass o whether it's a NULL or pointer to a fortran-allocated finufft_opts:
     *ier = FINUFFT_MAKEPLAN(*type, *n_dims, n_modes, *iflag, *n_transf, *tol, plan, o);
   }
 }
 
-void FINUFFT_SETPTS_(FINUFFT_PLAN *plan, BIGINT *M, FLT *xj, FLT *yj, FLT *zj, BIGINT *nk, FLT *s, FLT *t, FLT *u, int *ier)
-{
+void FINUFFT_SETPTS_(FINUFFT_PLAN *plan, BIGINT *M, FLT *xj, FLT *yj, FLT *zj, BIGINT *nk,
+                     FLT *s, FLT *t, FLT *u, int *ier) {
   if (!*plan) {
-    fprintf(stderr,"%s fortran: finufft_plan unallocated!",__func__);
+    fprintf(stderr, "%s fortran: finufft_plan unallocated!", __func__);
     return;
   }
-  int nk_safe = 0;           // catches the case where user passes NULL in
-  if (nk)
-    nk_safe = *nk;
+  int nk_safe = 0; // catches the case where user passes NULL in
+  if (nk) nk_safe = *nk;
   *ier = FINUFFT_SETPTS(*plan, *M, xj, yj, zj, nk_safe, s, t, u);
 }
 
-void FINUFFT_EXECUTE_(FINUFFT_PLAN *plan, CPX *weights, CPX *result, int *ier)
-{
+void FINUFFT_EXECUTE_(FINUFFT_PLAN *plan, CPX *weights, CPX *result, int *ier) {
   if (!plan)
-    fprintf(stderr,"%s fortran: finufft_plan unallocated!",__func__);
+    fprintf(stderr, "%s fortran: finufft_plan unallocated!", __func__);
   else
     *ier = FINUFFT_EXECUTE(*plan, weights, result);
 }
 
-void FINUFFT_DESTROY_(FINUFFT_PLAN *plan, int *ier)
-{
+void FINUFFT_DESTROY_(FINUFFT_PLAN *plan, int *ier) {
   if (!plan)
-    fprintf(stderr,"%s fortran: finufft_plan unallocated!",__func__);
+    fprintf(stderr, "%s fortran: finufft_plan unallocated!", __func__);
   else
     *ier = FINUFFT_DESTROY(*plan);
 }
 
-  
 // ------------ use FINUFFT to set the default options ---------------------
 // (Note the finufft_opts is created in f90-style derived types, not here)
-void FINUFFT_DEFAULT_OPTS_(finufft_opts* o)
-{
+void FINUFFT_DEFAULT_OPTS_(finufft_opts *o) {
   if (!o)
-    fprintf(stderr,"%s fortran: opts must be allocated!\n",__func__);
+    fprintf(stderr, "%s fortran: opts must be allocated!\n", __func__);
   else
     // o is a ptr to already-allocated fortran finufft_opts derived type...
     FINUFFT_DEFAULT_OPTS(o);
 }
 
-
 // -------------- simple and many-vector interfaces --------------------
 // --- 1D ---
-void FINUFFT1D1_(BIGINT* nj, FLT* xj, CPX* cj, int* iflag, FLT* eps,
-                 BIGINT* ms, CPX* fk, finufft_opts* o, int* ier)
-{
-  *ier = FINUFFT1D1(*nj,xj,cj,*iflag,*eps,*ms,fk,o);
+void FINUFFT1D1_(BIGINT *nj, FLT *xj, CPX *cj, int *iflag, FLT *eps, BIGINT *ms, CPX *fk,
+                 finufft_opts *o, int *ier) {
+  *ier = FINUFFT1D1(*nj, xj, cj, *iflag, *eps, *ms, fk, o);
 }
 
-void FINUFFT1D1MANY_(int* ntransf,
-                 BIGINT* nj, FLT* xj, CPX* cj, int* iflag, FLT* eps,
-                 BIGINT* ms, CPX* fk, finufft_opts* o, int* ier)
-{
-  *ier = FINUFFT1D1MANY(*ntransf,*nj,xj,cj,*iflag,*eps,*ms,fk,o);
+void FINUFFT1D1MANY_(int *ntransf, BIGINT *nj, FLT *xj, CPX *cj, int *iflag, FLT *eps,
+                     BIGINT *ms, CPX *fk, finufft_opts *o, int *ier) {
+  *ier = FINUFFT1D1MANY(*ntransf, *nj, xj, cj, *iflag, *eps, *ms, fk, o);
 }
 
-void FINUFFT1D2_(BIGINT* nj, FLT* xj, CPX* cj, int* iflag, FLT* eps,
-                 BIGINT* ms, CPX* fk, finufft_opts* o, int* ier)
-{
-  *ier = FINUFFT1D2(*nj,xj,cj,*iflag,*eps,*ms,fk,o);
+void FINUFFT1D2_(BIGINT *nj, FLT *xj, CPX *cj, int *iflag, FLT *eps, BIGINT *ms, CPX *fk,
+                 finufft_opts *o, int *ier) {
+  *ier = FINUFFT1D2(*nj, xj, cj, *iflag, *eps, *ms, fk, o);
 }
 
-void FINUFFT1D2MANY_(int* ntransf,
-                 BIGINT* nj, FLT* xj, CPX* cj, int* iflag, FLT* eps,
-                 BIGINT* ms, CPX* fk, finufft_opts* o, int* ier)
-{
-  *ier = FINUFFT1D2MANY(*ntransf,*nj,xj,cj,*iflag,*eps,*ms,fk,o);
+void FINUFFT1D2MANY_(int *ntransf, BIGINT *nj, FLT *xj, CPX *cj, int *iflag, FLT *eps,
+                     BIGINT *ms, CPX *fk, finufft_opts *o, int *ier) {
+  *ier = FINUFFT1D2MANY(*ntransf, *nj, xj, cj, *iflag, *eps, *ms, fk, o);
 }
 
-void FINUFFT1D3_(BIGINT* nj, FLT* x, CPX* c, int* iflag, FLT* eps,
-                 BIGINT* nk, FLT* s, CPX* f, finufft_opts* o, int* ier)
-{
-  *ier = FINUFFT1D3(*nj,x,c,*iflag,*eps,*nk,s,f,o);
+void FINUFFT1D3_(BIGINT *nj, FLT *x, CPX *c, int *iflag, FLT *eps, BIGINT *nk, FLT *s,
+                 CPX *f, finufft_opts *o, int *ier) {
+  *ier = FINUFFT1D3(*nj, x, c, *iflag, *eps, *nk, s, f, o);
 }
 
-void FINUFFT1D3MANY_(int* ntransf,
-                 BIGINT* nj, FLT* x, CPX* c, int* iflag, FLT* eps,
-                 BIGINT* nk, FLT* s, CPX* f, finufft_opts* o, int* ier)
-{
-  *ier = FINUFFT1D3MANY(*ntransf,*nj,x,c,*iflag,*eps,*nk,s,f,o);
+void FINUFFT1D3MANY_(int *ntransf, BIGINT *nj, FLT *x, CPX *c, int *iflag, FLT *eps,
+                     BIGINT *nk, FLT *s, CPX *f, finufft_opts *o, int *ier) {
+  *ier = FINUFFT1D3MANY(*ntransf, *nj, x, c, *iflag, *eps, *nk, s, f, o);
 }
 
 // --- 2D ---
-void FINUFFT2D1_(BIGINT* nj, FLT* xj, FLT* yj, CPX* cj, int* iflag, FLT* eps,
-                 BIGINT* ms, BIGINT* mt, CPX* fk, finufft_opts* o, int* ier)
-{
-  *ier = FINUFFT2D1(*nj,xj,yj,cj,*iflag,*eps,*ms,*mt,fk,o);
+void FINUFFT2D1_(BIGINT *nj, FLT *xj, FLT *yj, CPX *cj, int *iflag, FLT *eps, BIGINT *ms,
+                 BIGINT *mt, CPX *fk, finufft_opts *o, int *ier) {
+  *ier = FINUFFT2D1(*nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o);
 }
-void FINUFFT2D1MANY_(int* ntransf,
-                 BIGINT* nj, FLT* xj, FLT* yj, CPX* cj, int* iflag, FLT* eps,
-                 BIGINT* ms, BIGINT* mt, CPX* fk, finufft_opts* o, int* ier)
-{
-  *ier = FINUFFT2D1MANY(*ntransf,*nj,xj,yj,cj,*iflag,*eps,*ms,*mt,fk,o);
+void FINUFFT2D1MANY_(int *ntransf, BIGINT *nj, FLT *xj, FLT *yj, CPX *cj, int *iflag,
+                     FLT *eps, BIGINT *ms, BIGINT *mt, CPX *fk, finufft_opts *o,
+                     int *ier) {
+  *ier = FINUFFT2D1MANY(*ntransf, *nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o);
 }
 
-void FINUFFT2D2_(BIGINT* nj, FLT* xj, FLT* yj, CPX* cj, int* iflag, FLT* eps,
-                 BIGINT* ms, BIGINT* mt, CPX* fk, finufft_opts* o, int* ier)
-{
-  *ier = FINUFFT2D2(*nj,xj,yj,cj,*iflag,*eps,*ms,*mt,fk,o);
+void FINUFFT2D2_(BIGINT *nj, FLT *xj, FLT *yj, CPX *cj, int *iflag, FLT *eps, BIGINT *ms,
+                 BIGINT *mt, CPX *fk, finufft_opts *o, int *ier) {
+  *ier = FINUFFT2D2(*nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o);
 }
-void FINUFFT2D2MANY_(int* ntransf,
-                 BIGINT* nj, FLT* xj, FLT* yj, CPX* cj, int* iflag, FLT* eps,
-                 BIGINT* ms, BIGINT* mt, CPX* fk, finufft_opts* o, int* ier)
-{
-  *ier = FINUFFT2D2MANY(*ntransf,*nj,xj,yj,cj,*iflag,*eps,*ms,*mt,fk,o);
+void FINUFFT2D2MANY_(int *ntransf, BIGINT *nj, FLT *xj, FLT *yj, CPX *cj, int *iflag,
+                     FLT *eps, BIGINT *ms, BIGINT *mt, CPX *fk, finufft_opts *o,
+                     int *ier) {
+  *ier = FINUFFT2D2MANY(*ntransf, *nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o);
 }
 
-void FINUFFT2D3_(BIGINT* nj, FLT* x, FLT* y, CPX* c, int* iflag, FLT* eps,
-                 BIGINT* nk, FLT* s, FLT* t, CPX* f, finufft_opts* o, int* ier)
-{
-  *ier = FINUFFT2D3(*nj,x,y,c,*iflag,*eps,*nk,s,t,f,o);
+void FINUFFT2D3_(BIGINT *nj, FLT *x, FLT *y, CPX *c, int *iflag, FLT *eps, BIGINT *nk,
+                 FLT *s, FLT *t, CPX *f, finufft_opts *o, int *ier) {
+  *ier = FINUFFT2D3(*nj, x, y, c, *iflag, *eps, *nk, s, t, f, o);
 }
 
-void FINUFFT2D3MANY_(int* ntransf,
-                 BIGINT* nj, FLT* x, FLT* y, CPX* c, int* iflag, FLT* eps,
-                 BIGINT* nk, FLT* s, FLT* t, CPX* f, finufft_opts* o, int* ier)
-{
-  *ier = FINUFFT2D3MANY(*ntransf,*nj,x,y,c,*iflag,*eps,*nk,s,t,f,o);
+void FINUFFT2D3MANY_(int *ntransf, BIGINT *nj, FLT *x, FLT *y, CPX *c, int *iflag,
+                     FLT *eps, BIGINT *nk, FLT *s, FLT *t, CPX *f, finufft_opts *o,
+                     int *ier) {
+  *ier = FINUFFT2D3MANY(*ntransf, *nj, x, y, c, *iflag, *eps, *nk, s, t, f, o);
 }
 
 // --- 3D ---
-void FINUFFT3D1_(BIGINT* nj, FLT* xj, FLT* yj, FLT* zj, CPX* cj, int* iflag, FLT* eps,
-                 BIGINT* ms, BIGINT* mt, BIGINT* mu, CPX* fk, finufft_opts* o, int* ier)
-{
-  *ier = FINUFFT3D1(*nj,xj,yj,zj,cj,*iflag,*eps,*ms,*mt,*mu,fk,o);
+void FINUFFT3D1_(BIGINT *nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int *iflag, FLT *eps,
+                 BIGINT *ms, BIGINT *mt, BIGINT *mu, CPX *fk, finufft_opts *o, int *ier) {
+  *ier = FINUFFT3D1(*nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o);
 }
 
-void FINUFFT3D1MANY_(int* ntransf,
-                 BIGINT* nj, FLT* xj, FLT* yj, FLT* zj, CPX* cj, int* iflag, FLT* eps,
-                 BIGINT* ms, BIGINT* mt, BIGINT* mu, CPX* fk, finufft_opts* o, int* ier)
-{
-  *ier = FINUFFT3D1MANY(*ntransf,*nj,xj,yj,zj,cj,*iflag,*eps,*ms,*mt,*mu,fk,o);
+void FINUFFT3D1MANY_(int *ntransf, BIGINT *nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj,
+                     int *iflag, FLT *eps, BIGINT *ms, BIGINT *mt, BIGINT *mu, CPX *fk,
+                     finufft_opts *o, int *ier) {
+  *ier =
+      FINUFFT3D1MANY(*ntransf, *nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o);
 }
 
-void FINUFFT3D2_(BIGINT* nj, FLT* xj, FLT* yj, FLT* zj, CPX* cj, int* iflag, FLT* eps,
-                 BIGINT* ms, BIGINT* mt, BIGINT* mu, CPX* fk, finufft_opts* o, int* ier)
-{
-  *ier = FINUFFT3D2(*nj,xj,yj,zj,cj,*iflag,*eps,*ms,*mt,*mu,fk,o);
+void FINUFFT3D2_(BIGINT *nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int *iflag, FLT *eps,
+                 BIGINT *ms, BIGINT *mt, BIGINT *mu, CPX *fk, finufft_opts *o, int *ier) {
+  *ier = FINUFFT3D2(*nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o);
 }
 
-void FINUFFT3D2MANY_(int* ntransf,
-                 BIGINT* nj, FLT* xj, FLT* yj, FLT* zj, CPX* cj, int* iflag, FLT* eps,
-                 BIGINT* ms, BIGINT* mt, BIGINT* mu, CPX* fk, finufft_opts* o, int* ier)
-{
-  *ier = FINUFFT3D2MANY(*ntransf,*nj,xj,yj,zj,cj,*iflag,*eps,*ms,*mt,*mu,fk,o);
+void FINUFFT3D2MANY_(int *ntransf, BIGINT *nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj,
+                     int *iflag, FLT *eps, BIGINT *ms, BIGINT *mt, BIGINT *mu, CPX *fk,
+                     finufft_opts *o, int *ier) {
+  *ier =
+      FINUFFT3D2MANY(*ntransf, *nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o);
 }
 
-void FINUFFT3D3_(BIGINT* nj, FLT* x, FLT* y, FLT* z, CPX* c, int* iflag, FLT* eps,
-                 BIGINT* nk, FLT* s, FLT* t, FLT* u, CPX* f, finufft_opts* o, int* ier)
-{
-  *ier = FINUFFT3D3(*nj,x,y,z,c,*iflag,*eps,*nk,s,t,u,f,o);
+void FINUFFT3D3_(BIGINT *nj, FLT *x, FLT *y, FLT *z, CPX *c, int *iflag, FLT *eps,
+                 BIGINT *nk, FLT *s, FLT *t, FLT *u, CPX *f, finufft_opts *o, int *ier) {
+  *ier = FINUFFT3D3(*nj, x, y, z, c, *iflag, *eps, *nk, s, t, u, f, o);
 }
 
-void FINUFFT3D3MANY_(int* ntransf,
-                 BIGINT* nj, FLT* x, FLT* y, FLT* z, CPX* c, int* iflag, FLT* eps,
-                 BIGINT* nk, FLT* s, FLT* t, FLT* u, CPX* f, finufft_opts* o, int* ier)
-{
-  *ier = FINUFFT3D3MANY(*ntransf,*nj,x,y,z,c,*iflag,*eps,*nk,s,t,u,f,o);
+void FINUFFT3D3MANY_(int *ntransf, BIGINT *nj, FLT *x, FLT *y, FLT *z, CPX *c, int *iflag,
+                     FLT *eps, BIGINT *nk, FLT *s, FLT *t, FLT *u, CPX *f,
+                     finufft_opts *o, int *ier) {
+  *ier = FINUFFT3D3MANY(*ntransf, *nj, x, y, z, c, *iflag, *eps, *nk, s, t, u, f, o);
 }
 
-  
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/cufinufft.h b/include/cufinufft.h
index 3c498fed0..b323d94c0 100644
--- a/include/cufinufft.h
+++ b/include/cufinufft.h
@@ -14,15 +14,15 @@ extern "C" {
 #endif
 void cufinufft_default_opts(cufinufft_opts *opts);
 
-int cufinufft_makeplan(int type, int dim, const int64_t *n_modes, int iflag, int ntr, double eps,
-                       cufinufft_plan *d_plan_ptr, cufinufft_opts *opts);
-int cufinufftf_makeplan(int type, int dim, const int64_t *n_modes, int iflag, int ntr, float eps,
-                        cufinufftf_plan *d_plan_ptr, cufinufft_opts *opts);
-
-int cufinufft_setpts(cufinufft_plan d_plan, int M, double *d_x, double *d_y, double *d_z, int N, double *d_s,
-                     double *d_t, double *d_u);
-int cufinufftf_setpts(cufinufftf_plan d_plan, int M, float *d_x, float *d_y, float *d_z, int N, float *d_s,
-                      float *d_t, float *d_u);
+int cufinufft_makeplan(int type, int dim, const int64_t *n_modes, int iflag, int ntr,
+                       double eps, cufinufft_plan *d_plan_ptr, cufinufft_opts *opts);
+int cufinufftf_makeplan(int type, int dim, const int64_t *n_modes, int iflag, int ntr,
+                        float eps, cufinufftf_plan *d_plan_ptr, cufinufft_opts *opts);
+
+int cufinufft_setpts(cufinufft_plan d_plan, int M, double *d_x, double *d_y, double *d_z,
+                     int N, double *d_s, double *d_t, double *d_u);
+int cufinufftf_setpts(cufinufftf_plan d_plan, int M, float *d_x, float *d_y, float *d_z,
+                      int N, float *d_s, float *d_t, float *d_u);
 
 int cufinufft_execute(cufinufft_plan d_plan, cuDoubleComplex *d_c, cuDoubleComplex *d_fk);
 int cufinufftf_execute(cufinufftf_plan d_plan, cuFloatComplex *d_c, cuFloatComplex *d_fk);
diff --git a/include/cufinufft/common.h b/include/cufinufft/common.h
index 3ea437448..7bddc188e 100644
--- a/include/cufinufft/common.h
+++ b/include/cufinufft/common.h
@@ -10,24 +10,27 @@
 
 namespace cufinufft {
 namespace common {
-template <typename T>
-__global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, cuDoubleComplex *a, T *fwkerhalf1,
-                                       T *fwkerhalf2, T *fwkerhalf3, int ns);
-template <typename T>
-int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f, cuDoubleComplex *d_a, T *d_fwkerhalf1,
-                           T *d_fwkerhalf2, T *d_fwkerhalf3, int ns, cudaStream_t stream);
-template <typename T>
+template<typename T>
+__global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f,
+                                       cuDoubleComplex *a, T *fwkerhalf1, T *fwkerhalf2,
+                                       T *fwkerhalf3, int ns);
+template<typename T>
+int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f,
+                           cuDoubleComplex *d_a, T *d_fwkerhalf1, T *d_fwkerhalf2,
+                           T *d_fwkerhalf3, int ns, cudaStream_t stream);
+template<typename T>
 int setup_spreader_for_nufft(finufft_spread_opts &spopts, T eps, cufinufft_opts opts);
 
-void set_nf_type12(CUFINUFFT_BIGINT ms, cufinufft_opts opts, finufft_spread_opts spopts, CUFINUFFT_BIGINT *nf,
-                   CUFINUFFT_BIGINT b);
-template <typename T>
+void set_nf_type12(CUFINUFFT_BIGINT ms, cufinufft_opts opts, finufft_spread_opts spopts,
+                   CUFINUFFT_BIGINT *nf, CUFINUFFT_BIGINT b);
+template<typename T>
 void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, T *fwkerhalf, finufft_spread_opts opts);
-template <typename T>
-void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, std::complex<double> *a, finufft_spread_opts opts);
-template <typename T>
-void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex<double> *a, T *fwkerhalf,
+template<typename T>
+void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, std::complex<double> *a,
                                    finufft_spread_opts opts);
+template<typename T>
+void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex<double> *a,
+                                   T *fwkerhalf, finufft_spread_opts opts);
 
 } // namespace common
 } // namespace cufinufft
diff --git a/include/cufinufft/contrib/helper_cuda.h b/include/cufinufft/contrib/helper_cuda.h
index 69dad3b86..3f3f931c6 100644
--- a/include/cufinufft/contrib/helper_cuda.h
+++ b/include/cufinufft/contrib/helper_cuda.h
@@ -37,95 +37,97 @@
 
 #include <cufft.h>
 
-static const char *_cudaGetErrorEnum(cudaError_t error) { return cudaGetErrorName(error); }
+static const char *_cudaGetErrorEnum(cudaError_t error) {
+  return cudaGetErrorName(error);
+}
 
 // This will output the proper CUDA error strings in the event
 // that a CUDA host call returns an error
 #define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
 
-#define RETURN_IF_CUDA_ERROR                                                                                           \
-    {                                                                                                                  \
-        cudaError_t err = cudaGetLastError();                                                                          \
-        if (err != cudaSuccess) {                                                                                      \
-            printf("[%s] Error: %s\n", __func__, cudaGetErrorString(err));                                             \
-            return FINUFFT_ERR_CUDA_FAILURE;                                                                           \
-        }                                                                                                              \
-    }
-
-#define CUDA_FREE_AND_NULL(val, stream)                                                                                \
-    {                                                                                                                  \
-        if (val != nullptr) {                                                                                          \
-            check(cudaFreeAsync(val, stream), #val, __FILE__, __LINE__);                                               \
-            val = nullptr;                                                                                             \
-        }                                                                                                              \
-    }
+#define RETURN_IF_CUDA_ERROR                                         \
+  {                                                                  \
+    cudaError_t err = cudaGetLastError();                            \
+    if (err != cudaSuccess) {                                        \
+      printf("[%s] Error: %s\n", __func__, cudaGetErrorString(err)); \
+      return FINUFFT_ERR_CUDA_FAILURE;                               \
+    }                                                                \
+  }
+
+#define CUDA_FREE_AND_NULL(val, stream)                            \
+  {                                                                \
+    if (val != nullptr) {                                          \
+      check(cudaFreeAsync(val, stream), #val, __FILE__, __LINE__); \
+      val = nullptr;                                               \
+    }                                                              \
+  }
 
 static const char *cufftGetErrorString(cufftResult error) {
-    switch (error) {
-    case CUFFT_SUCCESS:
-        return "CUFFT_SUCCESS";
+  switch (error) {
+  case CUFFT_SUCCESS:
+    return "CUFFT_SUCCESS";
 
-    case CUFFT_INVALID_PLAN:
-        return "CUFFT_INVALID_PLAN";
+  case CUFFT_INVALID_PLAN:
+    return "CUFFT_INVALID_PLAN";
 
-    case CUFFT_ALLOC_FAILED:
-        return "CUFFT_ALLOC_FAILED";
+  case CUFFT_ALLOC_FAILED:
+    return "CUFFT_ALLOC_FAILED";
 
-    case CUFFT_INVALID_TYPE:
-        return "CUFFT_INVALID_TYPE";
+  case CUFFT_INVALID_TYPE:
+    return "CUFFT_INVALID_TYPE";
 
-    case CUFFT_INVALID_VALUE:
-        return "CUFFT_INVALID_VALUE";
+  case CUFFT_INVALID_VALUE:
+    return "CUFFT_INVALID_VALUE";
 
-    case CUFFT_INTERNAL_ERROR:
-        return "CUFFT_INTERNAL_ERROR";
+  case CUFFT_INTERNAL_ERROR:
+    return "CUFFT_INTERNAL_ERROR";
 
-    case CUFFT_EXEC_FAILED:
-        return "CUFFT_EXEC_FAILED";
+  case CUFFT_EXEC_FAILED:
+    return "CUFFT_EXEC_FAILED";
 
-    case CUFFT_SETUP_FAILED:
-        return "CUFFT_SETUP_FAILED";
+  case CUFFT_SETUP_FAILED:
+    return "CUFFT_SETUP_FAILED";
 
-    case CUFFT_INVALID_SIZE:
-        return "CUFFT_INVALID_SIZE";
+  case CUFFT_INVALID_SIZE:
+    return "CUFFT_INVALID_SIZE";
 
-    case CUFFT_UNALIGNED_DATA:
-        return "CUFFT_UNALIGNED_DATA";
+  case CUFFT_UNALIGNED_DATA:
+    return "CUFFT_UNALIGNED_DATA";
 
-    case CUFFT_INCOMPLETE_PARAMETER_LIST:
-        return "CUFFT_INCOMPLETE_PARAMETER_LIST";
+  case CUFFT_INCOMPLETE_PARAMETER_LIST:
+    return "CUFFT_INCOMPLETE_PARAMETER_LIST";
 
-    case CUFFT_INVALID_DEVICE:
-        return "CUFFT_INVALID_DEVICE";
+  case CUFFT_INVALID_DEVICE:
+    return "CUFFT_INVALID_DEVICE";
 
-    case CUFFT_PARSE_ERROR:
-        return "CUFFT_PARSE_ERROR";
+  case CUFFT_PARSE_ERROR:
+    return "CUFFT_PARSE_ERROR";
 
-    case CUFFT_NO_WORKSPACE:
-        return "CUFFT_NO_WORKSPACE";
+  case CUFFT_NO_WORKSPACE:
+    return "CUFFT_NO_WORKSPACE";
 
-    case CUFFT_NOT_IMPLEMENTED:
-        return "CUFFT_NOT_IMPLEMENTED";
+  case CUFFT_NOT_IMPLEMENTED:
+    return "CUFFT_NOT_IMPLEMENTED";
 
-    case CUFFT_LICENSE_ERROR:
-        return "CUFFT_LICENSE_ERROR";
+  case CUFFT_LICENSE_ERROR:
+    return "CUFFT_LICENSE_ERROR";
 
-    case CUFFT_NOT_SUPPORTED:
-        return "CUFFT_NOT_SUPPORTED";
-    }
+  case CUFFT_NOT_SUPPORTED:
+    return "CUFFT_NOT_SUPPORTED";
+  }
 
-    return "<unknown>";
+  return "<unknown>";
 }
 
-template <typename T>
+template<typename T>
 int check(T result, char const *const func, const char *const file, int const line) {
-    if (result) {
-        fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, static_cast<unsigned int>(result),
-                _cudaGetErrorEnum(result), func);
-        return FINUFFT_ERR_CUDA_FAILURE;
-    }
+  if (result) {
+    fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
+            static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
+    return FINUFFT_ERR_CUDA_FAILURE;
+  }
 
-    return 0;
+  return 0;
 }
 
 #endif // COMMON_HELPER_CUDA_H_
diff --git a/include/cufinufft/cudeconvolve.h b/include/cufinufft/cudeconvolve.h
index 4daa4767e..ed701ed28 100644
--- a/include/cufinufft/cudeconvolve.h
+++ b/include/cufinufft/cudeconvolve.h
@@ -5,30 +5,33 @@
 
 namespace cufinufft {
 namespace deconvolve {
-template <typename T>
-__global__ void deconvolve_1d(int ms, int nf1, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1);
-template <typename T>
-__global__ void amplify_1d(int ms, int nf1, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf2);
-template <typename T>
-__global__ void deconvolve_2d(int ms, int mt, int nf1, int nf2, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk,
-                              T *fwkerhalf1, T *fwkerhalf2);
-template <typename T>
-__global__ void amplify_2d(int ms, int mt, int nf1, int nf2, int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk,
-                           T *fwkerhalf1, T *fwkerhalf2);
+template<typename T>
+__global__ void deconvolve_1d(int ms, int nf1, int fw_width, cuda_complex<T> *fw,
+                              cuda_complex<T> *fk, T *fwkerhalf1);
+template<typename T>
+__global__ void amplify_1d(int ms, int nf1, int fw_width, cuda_complex<T> *fw,
+                           cuda_complex<T> *fk, T *fwkerhalf2);
+template<typename T>
+__global__ void deconvolve_2d(int ms, int mt, int nf1, int nf2, int fw_width,
+                              cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1,
+                              T *fwkerhalf2);
+template<typename T>
+__global__ void amplify_2d(int ms, int mt, int nf1, int nf2, int fw_width,
+                           cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1,
+                           T *fwkerhalf2);
 
-template <typename T>
-__global__ void deconvolve_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, int fw_width, cuda_complex<T> *fw,
-                              cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3);
-template <typename T>
-__global__ void amplify_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, int fw_width, cuda_complex<T> *fw,
-                           cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3);
+template<typename T>
+__global__ void deconvolve_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3,
+                              int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk,
+                              T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3);
+template<typename T>
+__global__ void amplify_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3,
+                           int fw_width, cuda_complex<T> *fw, cuda_complex<T> *fk,
+                           T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3);
 
-template <typename T>
-int cudeconvolve1d(cufinufft_plan_t<T> *d_mem, int blksize);
-template <typename T>
-int cudeconvolve2d(cufinufft_plan_t<T> *d_mem, int blksize);
-template <typename T>
-int cudeconvolve3d(cufinufft_plan_t<T> *d_mem, int blksize);
+template<typename T> int cudeconvolve1d(cufinufft_plan_t<T> *d_mem, int blksize);
+template<typename T> int cudeconvolve2d(cufinufft_plan_t<T> *d_mem, int blksize);
+template<typename T> int cudeconvolve3d(cufinufft_plan_t<T> *d_mem, int blksize);
 } // namespace deconvolve
 } // namespace cufinufft
 #endif
diff --git a/include/cufinufft/defs.h b/include/cufinufft/defs.h
index 6cdb84340..6b2a075ea 100644
--- a/include/cufinufft/defs.h
+++ b/include/cufinufft/defs.h
@@ -4,11 +4,12 @@
 #include <limits>
 
 // constants needed within common
-// upper bound on w, ie nspread, even when padded (see evaluate_kernel_vector); also for common
+// upper bound on w, ie nspread, even when padded (see evaluate_kernel_vector); also for
+// common
 #define MAX_NSPREAD 16
 
 // max number of positive quadr nodes
-#define MAX_NQUAD 100
+#define MAX_NQUAD   100
 
 // FIXME: If cufft ever takes N > INT_MAX...
 constexpr int32_t MAX_NF = std::numeric_limits<int32_t>::max();
@@ -18,16 +19,16 @@ constexpr int32_t MAX_NF = std::numeric_limits<int32_t>::max();
 #ifdef _OPENMP
 #include <omp.h>
 // point to actual omp utils
-#define MY_OMP_GET_NUM_THREADS() omp_get_num_threads()
-#define MY_OMP_GET_MAX_THREADS() omp_get_max_threads()
-#define MY_OMP_GET_THREAD_NUM() omp_get_thread_num()
+#define MY_OMP_GET_NUM_THREADS()  omp_get_num_threads()
+#define MY_OMP_GET_MAX_THREADS()  omp_get_max_threads()
+#define MY_OMP_GET_THREAD_NUM()   omp_get_thread_num()
 #define MY_OMP_SET_NUM_THREADS(x) omp_set_num_threads(x)
-#define MY_OMP_SET_NESTED(x) omp_set_nested(x)
+#define MY_OMP_SET_NESTED(x)      omp_set_nested(x)
 #else
 // non-omp safe dummy versions of omp utils
 #define MY_OMP_GET_NUM_THREADS() 1
 #define MY_OMP_GET_MAX_THREADS() 1
-#define MY_OMP_GET_THREAD_NUM() 0
+#define MY_OMP_GET_THREAD_NUM()  0
 #define MY_OMP_SET_NUM_THREADS(x)
 #define MY_OMP_SET_NESTED(x)
 #endif
diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h
index 34b969b46..3b8d3db2c 100644
--- a/include/cufinufft/impl.h
+++ b/include/cufinufft/impl.h
@@ -16,255 +16,269 @@
 #include <finufft_errors.h>
 
 // 1d
-template <typename T>
-int cufinufft1d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_plan_t<T> *d_plan);
-template <typename T>
-int cufinufft1d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_plan_t<T> *d_plan);
+template<typename T>
+int cufinufft1d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
+                      cufinufft_plan_t<T> *d_plan);
+template<typename T>
+int cufinufft1d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
+                      cufinufft_plan_t<T> *d_plan);
 
 // 2d
-template <typename T>
-int cufinufft2d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_plan_t<T> *d_plan);
-template <typename T>
-int cufinufft2d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_plan_t<T> *d_plan);
+template<typename T>
+int cufinufft2d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
+                      cufinufft_plan_t<T> *d_plan);
+template<typename T>
+int cufinufft2d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
+                      cufinufft_plan_t<T> *d_plan);
 
 // 3d
-template <typename T>
-int cufinufft3d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_plan_t<T> *d_plan);
-template <typename T>
-int cufinufft3d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_plan_t<T> *d_plan);
+template<typename T>
+int cufinufft3d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
+                      cufinufft_plan_t<T> *d_plan);
+template<typename T>
+int cufinufft3d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
+                      cufinufft_plan_t<T> *d_plan);
 
 static void cufinufft_setup_binsize(int type, int dim, cufinufft_opts *opts) {
-    switch (dim) {
-    case 1: {
-        opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 1024 : opts->gpu_binsizex;
-        opts->gpu_binsizey = 1;
-        opts->gpu_binsizez = 1;
-    } break;
+  switch (dim) {
+  case 1: {
+    opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 1024 : opts->gpu_binsizex;
+    opts->gpu_binsizey = 1;
+    opts->gpu_binsizez = 1;
+  } break;
+  case 2: {
+    opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 32 : opts->gpu_binsizex;
+    opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 32 : opts->gpu_binsizey;
+    opts->gpu_binsizez = 1;
+  } break;
+  case 3: {
+    switch (opts->gpu_method) {
+    case 1:
     case 2: {
-        opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 32 : opts->gpu_binsizex;
-        opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 32 : opts->gpu_binsizey;
-        opts->gpu_binsizez = 1;
+      opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 16 : opts->gpu_binsizex;
+      opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 16 : opts->gpu_binsizey;
+      opts->gpu_binsizez = (opts->gpu_binsizez < 0) ? 2 : opts->gpu_binsizez;
     } break;
-    case 3: {
-        switch (opts->gpu_method) {
-        case 1:
-        case 2: {
-            opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 16 : opts->gpu_binsizex;
-            opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 16 : opts->gpu_binsizey;
-            opts->gpu_binsizez = (opts->gpu_binsizez < 0) ? 2 : opts->gpu_binsizez;
-        } break;
-        case 4: {
-            opts->gpu_obinsizex = (opts->gpu_obinsizex < 0) ? 8 : opts->gpu_obinsizex;
-            opts->gpu_obinsizey = (opts->gpu_obinsizey < 0) ? 8 : opts->gpu_obinsizey;
-            opts->gpu_obinsizez = (opts->gpu_obinsizez < 0) ? 8 : opts->gpu_obinsizez;
-            opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 4 : opts->gpu_binsizex;
-            opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 4 : opts->gpu_binsizey;
-            opts->gpu_binsizez = (opts->gpu_binsizez < 0) ? 4 : opts->gpu_binsizez;
-        } break;
-        }
+    case 4: {
+      opts->gpu_obinsizex = (opts->gpu_obinsizex < 0) ? 8 : opts->gpu_obinsizex;
+      opts->gpu_obinsizey = (opts->gpu_obinsizey < 0) ? 8 : opts->gpu_obinsizey;
+      opts->gpu_obinsizez = (opts->gpu_obinsizez < 0) ? 8 : opts->gpu_obinsizez;
+      opts->gpu_binsizex  = (opts->gpu_binsizex < 0) ? 4 : opts->gpu_binsizex;
+      opts->gpu_binsizey  = (opts->gpu_binsizey < 0) ? 4 : opts->gpu_binsizey;
+      opts->gpu_binsizez  = (opts->gpu_binsizez < 0) ? 4 : opts->gpu_binsizez;
     } break;
     }
+  } break;
+  }
 }
 
-template <typename T>
+template<typename T>
 int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntransf, T tol,
                             cufinufft_plan_t<T> **d_plan_ptr, cufinufft_opts *opts) {
-    /*
-        "plan" stage (in single or double precision).
-            See ../docs/cppdoc.md for main user-facing documentation.
-            Note that *d_plan_ptr in the args list was called simply *plan there.
-            This is the remaining dev-facing doc:
-
-    This performs:
-            (0) creating a new plan struct (d_plan), a pointer to which is passed
-                back by writing that pointer into *d_plan_ptr.
-            (1) set up the spread option, d_plan.spopts.
-            (2) calculate the correction factor on cpu, copy the value from cpu to
-                gpu
-            (3) allocate gpu arrays with size determined by number of fourier modes
-                and method related options that had been set in d_plan.opts
-            (4) call cufftPlanMany and save the cufft plan inside cufinufft plan
-            Variables and arrays inside the plan struct are set and allocated.
-
-        Melody Shih 07/25/19. Use-facing moved to markdown, Barnett 2/16/21.
-    */
-    int ier;
-    cuDoubleComplex *d_a = nullptr; // fseries temp data
-    T *d_f = nullptr;               // fseries temp data
-
-    if (type < 1 || type > 2) {
-        fprintf(stderr, "[%s] Invalid type (%d): should be 1 or 2.\n", __func__, type);
-        return FINUFFT_ERR_TYPE_NOTVALID;
-    }
-    if (ntransf < 1) {
-        fprintf(stderr, "[%s] Invalid ntransf (%d): should be at least 1.\n", __func__, ntransf);
-        return FINUFFT_ERR_NTRANS_NOTVALID;
-    }
-
-    // Mult-GPU support: set the CUDA Device ID:
-    const int device_id = opts == NULL ? 0 : opts->gpu_device_id;
-    cufinufft::utils::WithCudaDevice device_swapper(device_id);
-
-    /* allocate the plan structure, assign address to user pointer. */
-    cufinufft_plan_t<T> *d_plan = new cufinufft_plan_t<T>;
-    *d_plan_ptr = d_plan;
-    // Zero out your struct, (sets all pointers to NULL)
-    memset(d_plan, 0, sizeof(*d_plan));
-
-    /* If a user has not supplied their own options, assign defaults for them. */
-    if (opts == NULL) { // use default opts
-        cufinufft_default_opts(&(d_plan->opts));
-    } else {                  // or read from what's passed in
-        d_plan->opts = *opts; // keep a deep copy; changing *opts now has no effect
-    }
-
-    auto &stream = d_plan->stream = (cudaStream_t)d_plan->opts.gpu_stream;
-
-    /* Automatically set GPU method. */
-    if (d_plan->opts.gpu_method == 0) {
-        /* For type 1, we default to method 2 (SM) since this is generally faster.
-         * However, in the special case of _double precision_ in _three dimensions_
-         * with more than _three digits of precision_, there is note enough shared
-         * memory for this to work. As a result, we will default to method 1 (GM) in
-         * this special case.
-         *
-         * For type 2, we always default to method 1 (GM). */
-        if (type == 1 && (sizeof(T) == 4 || dim < 3 || tol >= 1e-3))
-            d_plan->opts.gpu_method = 2;
-        else if (type == 1 && tol < 1e-3)
-            d_plan->opts.gpu_method = 1;
-        else if (type == 2)
-            d_plan->opts.gpu_method = 1;
-    }
-
-    /* Setup Spreader */
-    using namespace cufinufft::common;
-    // can return FINUFFT_WARN_EPS_TOO_SMALL=1, which is OK
-    if ((ier = setup_spreader_for_nufft(d_plan->spopts, tol, d_plan->opts)) > 1) {
-        delete *d_plan_ptr;
-        *d_plan_ptr = nullptr;
-        return ier;
-    }
-
-    d_plan->dim = dim;
-    d_plan->ms = nmodes[0];
-    d_plan->mt = nmodes[1];
-    d_plan->mu = nmodes[2];
-
-    cufinufft_setup_binsize(type, dim, &d_plan->opts);
-    CUFINUFFT_BIGINT nf1 = 1, nf2 = 1, nf3 = 1;
-    set_nf_type12(d_plan->ms, d_plan->opts, d_plan->spopts, &nf1, d_plan->opts.gpu_obinsizex);
+  /*
+      "plan" stage (in single or double precision).
+          See ../docs/cppdoc.md for main user-facing documentation.
+          Note that *d_plan_ptr in the args list was called simply *plan there.
+          This is the remaining dev-facing doc:
+
+  This performs:
+          (0) creating a new plan struct (d_plan), a pointer to which is passed
+              back by writing that pointer into *d_plan_ptr.
+          (1) set up the spread option, d_plan.spopts.
+          (2) calculate the correction factor on cpu, copy the value from cpu to
+              gpu
+          (3) allocate gpu arrays with size determined by number of fourier modes
+              and method related options that had been set in d_plan.opts
+          (4) call cufftPlanMany and save the cufft plan inside cufinufft plan
+          Variables and arrays inside the plan struct are set and allocated.
+
+      Melody Shih 07/25/19. User-facing moved to markdown, Barnett 2/16/21.
+  */
+  int ier;
+  cuDoubleComplex *d_a = nullptr; // fseries temp data
+  T *d_f               = nullptr; // fseries temp data
+
+  if (type < 1 || type > 2) {
+    fprintf(stderr, "[%s] Invalid type (%d): should be 1 or 2.\n", __func__, type);
+    return FINUFFT_ERR_TYPE_NOTVALID;
+  }
+  if (ntransf < 1) {
+    fprintf(stderr, "[%s] Invalid ntransf (%d): should be at least 1.\n", __func__,
+            ntransf);
+    return FINUFFT_ERR_NTRANS_NOTVALID;
+  }
+
+  // Multi-GPU support: set the CUDA Device ID:
+  const int device_id = opts == NULL ? 0 : opts->gpu_device_id;
+  cufinufft::utils::WithCudaDevice device_swapper(device_id);
+
+  /* allocate the plan structure, assign address to user pointer. */
+  cufinufft_plan_t<T> *d_plan = new cufinufft_plan_t<T>;
+  *d_plan_ptr                 = d_plan;
+  // Zero out your struct, (sets all pointers to NULL)
+  memset(d_plan, 0, sizeof(*d_plan));
+
+  /* If a user has not supplied their own options, assign defaults for them. */
+  if (opts == NULL) {     // use default opts
+    cufinufft_default_opts(&(d_plan->opts));
+  } else {                // or read from what's passed in
+    d_plan->opts = *opts; // keep a deep copy; changing *opts now has no effect
+  }
+
+  auto &stream = d_plan->stream = (cudaStream_t)d_plan->opts.gpu_stream;
+
+  /* Automatically set GPU method. */
+  if (d_plan->opts.gpu_method == 0) {
+    /* For type 1, we default to method 2 (SM) since this is generally faster.
+     * However, in the special case of _double precision_ in _three dimensions_
+     * with more than _three digits of precision_, there is not enough shared
+     * memory for this to work. As a result, we will default to method 1 (GM) in
+     * this special case.
+     *
+     * For type 2, we always default to method 1 (GM). */
+    if (type == 1 && (sizeof(T) == 4 || dim < 3 || tol >= 1e-3))
+      d_plan->opts.gpu_method = 2;
+    else if (type == 1 && tol < 1e-3)
+      d_plan->opts.gpu_method = 1;
+    else if (type == 2)
+      d_plan->opts.gpu_method = 1;
+  }
+
+  /* Setup Spreader */
+  using namespace cufinufft::common;
+  // can return FINUFFT_WARN_EPS_TOO_SMALL=1, which is OK
+  if ((ier = setup_spreader_for_nufft(d_plan->spopts, tol, d_plan->opts)) > 1) {
+    delete *d_plan_ptr;
+    *d_plan_ptr = nullptr;
+    return ier;
+  }
+
+  d_plan->dim = dim;
+  d_plan->ms  = nmodes[0];
+  d_plan->mt  = nmodes[1];
+  d_plan->mu  = nmodes[2];
+
+  cufinufft_setup_binsize(type, dim, &d_plan->opts);
+  CUFINUFFT_BIGINT nf1 = 1, nf2 = 1, nf3 = 1;
+  set_nf_type12(d_plan->ms, d_plan->opts, d_plan->spopts, &nf1,
+                d_plan->opts.gpu_obinsizex);
+  if (dim > 1)
+    set_nf_type12(d_plan->mt, d_plan->opts, d_plan->spopts, &nf2,
+                  d_plan->opts.gpu_obinsizey);
+  if (dim > 2)
+    set_nf_type12(d_plan->mu, d_plan->opts, d_plan->spopts, &nf3,
+                  d_plan->opts.gpu_obinsizez);
+  int fftsign = (iflag >= 0) ? 1 : -1;
+
+  d_plan->nf1      = nf1;
+  d_plan->nf2      = nf2;
+  d_plan->nf3      = nf3;
+  d_plan->iflag    = fftsign;
+  d_plan->ntransf  = ntransf;
+  int maxbatchsize = opts ? opts->gpu_maxbatchsize : 0;
+  if (maxbatchsize == 0)                 // implies: use a heuristic.
+    maxbatchsize = std::min(ntransf, 8); // heuristic from test codes
+  d_plan->maxbatchsize = maxbatchsize;
+  d_plan->type         = type;
+
+  if (d_plan->type == 1) d_plan->spopts.spread_direction = 1;
+  if (d_plan->type == 2) d_plan->spopts.spread_direction = 2;
+
+  using namespace cufinufft::memtransfer;
+  switch (d_plan->dim) {
+  case 1: {
+    if ((ier = allocgpumem1d_plan<T>(d_plan))) goto finalize;
+  } break;
+  case 2: {
+    if ((ier = allocgpumem2d_plan<T>(d_plan))) goto finalize;
+  } break;
+  case 3: {
+    if ((ier = allocgpumem3d_plan<T>(d_plan))) goto finalize;
+  } break;
+  }
+
+  cufftHandle fftplan;
+  cufftResult_t cufft_status;
+  switch (d_plan->dim) {
+  case 1: {
+    int n[]       = {(int)nf1};
+    int inembed[] = {(int)nf1};
+
+    cufft_status = cufftPlanMany(&fftplan, 1, n, inembed, 1, inembed[0], inembed, 1,
+                                 inembed[0], cufft_type<T>(), maxbatchsize);
+  } break;
+  case 2: {
+    int n[]       = {(int)nf2, (int)nf1};
+    int inembed[] = {(int)nf2, (int)nf1};
+
+    cufft_status =
+        cufftPlanMany(&fftplan, 2, n, inembed, 1, inembed[0] * inembed[1], inembed, 1,
+                      inembed[0] * inembed[1], cufft_type<T>(), maxbatchsize);
+  } break;
+  case 3: {
+    int n[]       = {(int)nf3, (int)nf2, (int)nf1};
+    int inembed[] = {(int)nf3, (int)nf2, (int)nf1};
+
+    cufft_status = cufftPlanMany(
+        &fftplan, 3, n, inembed, 1, inembed[0] * inembed[1] * inembed[2], inembed, 1,
+        inembed[0] * inembed[1] * inembed[2], cufft_type<T>(), maxbatchsize);
+  } break;
+  }
+
+  if (cufft_status != CUFFT_SUCCESS) {
+    fprintf(stderr, "[%s] cufft makeplan error: %s", __func__,
+            cufftGetErrorString(cufft_status));
+    ier = FINUFFT_ERR_CUDA_FAILURE;
+    goto finalize;
+  }
+  cufftSetStream(fftplan, stream);
+
+  d_plan->fftplan = fftplan;
+  {
+    std::complex<double> *a = d_plan->fseries_precomp_a;
+    T *f                    = d_plan->fseries_precomp_f;
+
+    onedim_fseries_kernel_precomp(nf1, f, a, d_plan->spopts);
     if (dim > 1)
-        set_nf_type12(d_plan->mt, d_plan->opts, d_plan->spopts, &nf2, d_plan->opts.gpu_obinsizey);
+      onedim_fseries_kernel_precomp(nf2, f + MAX_NQUAD, a + MAX_NQUAD, d_plan->spopts);
     if (dim > 2)
-        set_nf_type12(d_plan->mu, d_plan->opts, d_plan->spopts, &nf3, d_plan->opts.gpu_obinsizez);
-    int fftsign = (iflag >= 0) ? 1 : -1;
-
-    d_plan->nf1 = nf1;
-    d_plan->nf2 = nf2;
-    d_plan->nf3 = nf3;
-    d_plan->iflag = fftsign;
-    d_plan->ntransf = ntransf;
-    int maxbatchsize = opts ? opts->gpu_maxbatchsize : 0;
-    if (maxbatchsize == 0)                   // implies: use a heuristic.
-        maxbatchsize = std::min(ntransf, 8); // heuristic from test codes
-    d_plan->maxbatchsize = maxbatchsize;
-    d_plan->type = type;
-
-    if (d_plan->type == 1)
-        d_plan->spopts.spread_direction = 1;
-    if (d_plan->type == 2)
-        d_plan->spopts.spread_direction = 2;
-
-    using namespace cufinufft::memtransfer;
-    switch (d_plan->dim) {
-    case 1: {
-        if ((ier = allocgpumem1d_plan<T>(d_plan)))
-            goto finalize;
-    } break;
-    case 2: {
-        if ((ier = allocgpumem2d_plan<T>(d_plan)))
-            goto finalize;
-    } break;
-    case 3: {
-        if ((ier = allocgpumem3d_plan<T>(d_plan)))
-            goto finalize;
-    } break;
-    }
-
-    cufftHandle fftplan;
-    cufftResult_t cufft_status;
-    switch (d_plan->dim) {
-    case 1: {
-        int n[] = {(int)nf1};
-        int inembed[] = {(int)nf1};
-
-        cufft_status = cufftPlanMany(&fftplan, 1, n, inembed, 1, inembed[0], inembed, 1, inembed[0], cufft_type<T>(),
-                                     maxbatchsize);
-    } break;
-    case 2: {
-        int n[] = {(int)nf2, (int)nf1};
-        int inembed[] = {(int)nf2, (int)nf1};
-
-        cufft_status = cufftPlanMany(&fftplan, 2, n, inembed, 1, inembed[0] * inembed[1], inembed, 1,
-                                     inembed[0] * inembed[1], cufft_type<T>(), maxbatchsize);
-    } break;
-    case 3: {
-        int n[] = {(int)nf3, (int)nf2, (int)nf1};
-        int inembed[] = {(int)nf3, (int)nf2, (int)nf1};
-
-        cufft_status = cufftPlanMany(&fftplan, 3, n, inembed, 1, inembed[0] * inembed[1] * inembed[2], inembed, 1,
-                                     inembed[0] * inembed[1] * inembed[2], cufft_type<T>(), maxbatchsize);
-    } break;
-    }
-
-    if (cufft_status != CUFFT_SUCCESS) {
-        fprintf(stderr, "[%s] cufft makeplan error: %s", __func__, cufftGetErrorString(cufft_status));
-        ier = FINUFFT_ERR_CUDA_FAILURE;
-        goto finalize;
-    }
-    cufftSetStream(fftplan, stream);
-
-    d_plan->fftplan = fftplan;
-    {
-        std::complex<double> *a = d_plan->fseries_precomp_a;
-        T *f = d_plan->fseries_precomp_f;
-
-        onedim_fseries_kernel_precomp(nf1, f, a, d_plan->spopts);
-        if (dim > 1)
-            onedim_fseries_kernel_precomp(nf2, f + MAX_NQUAD, a + MAX_NQUAD, d_plan->spopts);
-        if (dim > 2)
-            onedim_fseries_kernel_precomp(nf3, f + 2 * MAX_NQUAD, a + 2 * MAX_NQUAD, d_plan->spopts);
-
-        if ((ier = checkCudaErrors(cudaMallocAsync(&d_a, dim * MAX_NQUAD * sizeof(cuDoubleComplex), stream))))
-            goto finalize;
-        if ((ier = checkCudaErrors(cudaMallocAsync(&d_f, dim * MAX_NQUAD * sizeof(T), stream))))
-            goto finalize;
-        if ((ier = checkCudaErrors(
-                 cudaMemcpyAsync(d_a, a, dim * MAX_NQUAD * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice, stream))))
-            goto finalize;
-        if ((ier =
-                 checkCudaErrors(cudaMemcpyAsync(d_f, f, dim * MAX_NQUAD * sizeof(T), cudaMemcpyHostToDevice, stream))))
-            goto finalize;
-        if ((ier = cufserieskernelcompute(d_plan->dim, nf1, nf2, nf3, d_f, d_a, d_plan->fwkerhalf1, d_plan->fwkerhalf2,
-                                          d_plan->fwkerhalf3, d_plan->spopts.nspread, stream)))
-            goto finalize;
-    }
+      onedim_fseries_kernel_precomp(nf3, f + 2 * MAX_NQUAD, a + 2 * MAX_NQUAD,
+                                    d_plan->spopts);
+
+    if ((ier = checkCudaErrors(
+             cudaMallocAsync(&d_a, dim * MAX_NQUAD * sizeof(cuDoubleComplex), stream))))
+      goto finalize;
+    if ((ier =
+             checkCudaErrors(cudaMallocAsync(&d_f, dim * MAX_NQUAD * sizeof(T), stream))))
+      goto finalize;
+    if ((ier = checkCudaErrors(
+             cudaMemcpyAsync(d_a, a, dim * MAX_NQUAD * sizeof(cuDoubleComplex),
+                             cudaMemcpyHostToDevice, stream))))
+      goto finalize;
+    if ((ier = checkCudaErrors(cudaMemcpyAsync(d_f, f, dim * MAX_NQUAD * sizeof(T),
+                                               cudaMemcpyHostToDevice, stream))))
+      goto finalize;
+    if ((ier = cufserieskernelcompute(
+             d_plan->dim, nf1, nf2, nf3, d_f, d_a, d_plan->fwkerhalf1, d_plan->fwkerhalf2,
+             d_plan->fwkerhalf3, d_plan->spopts.nspread, stream)))
+      goto finalize;
+  }
 
 finalize:
-    cudaFreeAsync(d_a, stream);
-    cudaFreeAsync(d_f, stream);
+  cudaFreeAsync(d_a, stream);
+  cudaFreeAsync(d_f, stream);
 
-    if (ier > 1) {
-        delete *d_plan_ptr;
-        *d_plan_ptr = nullptr;
-    }
+  if (ier > 1) {
+    delete *d_plan_ptr;
+    *d_plan_ptr = nullptr;
+  }
 
-    return ier;
+  return ier;
 }
 
-template <typename T>
-int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_t, T *d_u, cufinufft_plan_t<T> *d_plan)
+template<typename T>
+int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_t, T *d_u,
+                          cufinufft_plan_t<T> *d_plan)
 /*
     "setNUpts" stage (in single or double precision).
 
@@ -302,66 +316,78 @@ Notes: the type T means either single or double, matching the
     Melody Shih 07/25/19; Barnett 2/16/21 moved out docs.
 */
 {
-    cufinufft::utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id);
-
-    int nf1 = d_plan->nf1;
-    int nf2 = d_plan->nf2;
-    int nf3 = d_plan->nf3;
-    int dim = d_plan->dim;
-
-    d_plan->M = M;
-
-    using namespace cufinufft::memtransfer;
-    int ier;
-    switch (d_plan->dim) {
-    case 1: {
-        ier = allocgpumem1d_nupts<T>(d_plan);
-    } break;
-    case 2: {
-        ier = allocgpumem2d_nupts<T>(d_plan);
-    } break;
-    case 3: {
-        ier = allocgpumem3d_nupts<T>(d_plan);
-    } break;
-    }
-    if (ier)
-        return ier;
-
-    d_plan->kx = d_kx;
-    if (dim > 1)
-        d_plan->ky = d_ky;
-    if (dim > 2)
-        d_plan->kz = d_kz;
-
-    using namespace cufinufft::spreadinterp;
-    switch (d_plan->dim) {
-    case 1: {
-        if (d_plan->opts.gpu_method == 1 && (ier = cuspread1d_nuptsdriven_prop<T>(nf1, M, d_plan)))
-            fprintf(stderr, "error: cuspread1d_nupts_prop, method(%d)\n", d_plan->opts.gpu_method);
-        if (d_plan->opts.gpu_method == 2 && (ier = cuspread1d_subprob_prop<T>(nf1, M, d_plan)))
-            fprintf(stderr, "error: cuspread1d_subprob_prop, method(%d)\n", d_plan->opts.gpu_method);
-    } break;
-    case 2: {
-        if (d_plan->opts.gpu_method == 1 && (ier = cuspread2d_nuptsdriven_prop<T>(nf1, nf2, M, d_plan)))
-            fprintf(stderr, "error: cuspread2d_nupts_prop, method(%d)\n", d_plan->opts.gpu_method);
-        if (d_plan->opts.gpu_method == 2 && (ier = cuspread2d_subprob_prop<T>(nf1, nf2, M, d_plan)))
-            fprintf(stderr, "error: cuspread2d_subprob_prop, method(%d)\n", d_plan->opts.gpu_method);
-    } break;
-    case 3: {
-        if (d_plan->opts.gpu_method == 1 && (ier = cuspread3d_nuptsdriven_prop<T>(nf1, nf2, nf3, M, d_plan)))
-            fprintf(stderr, "error: cuspread3d_nuptsdriven_prop, method(%d)\n", d_plan->opts.gpu_method);
-        if (d_plan->opts.gpu_method == 2 && (ier = cuspread3d_subprob_prop<T>(nf1, nf2, nf3, M, d_plan)))
-            fprintf(stderr, "error: cuspread3d_subprob_prop, method(%d)\n", d_plan->opts.gpu_method);
-        if (d_plan->opts.gpu_method == 4 && (ier = cuspread3d_blockgather_prop<T>(nf1, nf2, nf3, M, d_plan)))
-            fprintf(stderr, "error: cuspread3d_blockgather_prop, method(%d)\n", d_plan->opts.gpu_method);
-    } break;
-    }
-
-    return ier;
+  cufinufft::utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id);
+
+  int nf1 = d_plan->nf1;
+  int nf2 = d_plan->nf2;
+  int nf3 = d_plan->nf3;
+  int dim = d_plan->dim;
+
+  d_plan->M = M;
+
+  using namespace cufinufft::memtransfer;
+  int ier;
+  switch (d_plan->dim) {
+  case 1: {
+    ier = allocgpumem1d_nupts<T>(d_plan);
+  } break;
+  case 2: {
+    ier = allocgpumem2d_nupts<T>(d_plan);
+  } break;
+  case 3: {
+    ier = allocgpumem3d_nupts<T>(d_plan);
+  } break;
+  }
+  if (ier) return ier;
+
+  d_plan->kx = d_kx;
+  if (dim > 1) d_plan->ky = d_ky;
+  if (dim > 2) d_plan->kz = d_kz;
+
+  using namespace cufinufft::spreadinterp;
+  switch (d_plan->dim) {
+  case 1: {
+    if (d_plan->opts.gpu_method == 1 &&
+        (ier = cuspread1d_nuptsdriven_prop<T>(nf1, M, d_plan)))
+      fprintf(stderr, "error: cuspread1d_nupts_prop, method(%d)\n",
+              d_plan->opts.gpu_method);
+    if (d_plan->opts.gpu_method == 2 &&
+        (ier = cuspread1d_subprob_prop<T>(nf1, M, d_plan)))
+      fprintf(stderr, "error: cuspread1d_subprob_prop, method(%d)\n",
+              d_plan->opts.gpu_method);
+  } break;
+  case 2: {
+    if (d_plan->opts.gpu_method == 1 &&
+        (ier = cuspread2d_nuptsdriven_prop<T>(nf1, nf2, M, d_plan)))
+      fprintf(stderr, "error: cuspread2d_nupts_prop, method(%d)\n",
+              d_plan->opts.gpu_method);
+    if (d_plan->opts.gpu_method == 2 &&
+        (ier = cuspread2d_subprob_prop<T>(nf1, nf2, M, d_plan)))
+      fprintf(stderr, "error: cuspread2d_subprob_prop, method(%d)\n",
+              d_plan->opts.gpu_method);
+  } break;
+  case 3: {
+    if (d_plan->opts.gpu_method == 1 &&
+        (ier = cuspread3d_nuptsdriven_prop<T>(nf1, nf2, nf3, M, d_plan)))
+      fprintf(stderr, "error: cuspread3d_nuptsdriven_prop, method(%d)\n",
+              d_plan->opts.gpu_method);
+    if (d_plan->opts.gpu_method == 2 &&
+        (ier = cuspread3d_subprob_prop<T>(nf1, nf2, nf3, M, d_plan)))
+      fprintf(stderr, "error: cuspread3d_subprob_prop, method(%d)\n",
+              d_plan->opts.gpu_method);
+    if (d_plan->opts.gpu_method == 4 &&
+        (ier = cuspread3d_blockgather_prop<T>(nf1, nf2, nf3, M, d_plan)))
+      fprintf(stderr, "error: cuspread3d_blockgather_prop, method(%d)\n",
+              d_plan->opts.gpu_method);
+  } break;
+  }
+
+  return ier;
 }
 
-template <typename T>
-int cufinufft_execute_impl(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_plan_t<T> *d_plan)
+template<typename T>
+int cufinufft_execute_impl(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
+                           cufinufft_plan_t<T> *d_plan)
 /*
     "exec" stage (single and double precision versions).
 
@@ -377,53 +403,47 @@ int cufinufft_execute_impl(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinuff
           Type 2; output for Type 1)
 
     Notes:
-        i) Here CUFINUFFT_CPX is a defined type meaning either complex<float> or complex<double>
-        to match the precision of the library called.
-        ii) All operations are done on the GPU device (hence the d_* names)
+        i) Here CUFINUFFT_CPX is a defined type meaning either complex<float> or
+        complex<double> to match the precision of the library called.
+        ii) All operations are done on the GPU device (hence the d_* names)
 
     Melody Shih 07/25/19; Barnett 2/16/21.
 */
 {
-    cufinufft::utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id);
-    int ier;
-    int type = d_plan->type;
-    switch (d_plan->dim) {
-    case 1: {
-        if (type == 1)
-            ier = cufinufft1d1_exec<T>(d_c, d_fk, d_plan);
-        if (type == 2)
-            ier = cufinufft1d2_exec<T>(d_c, d_fk, d_plan);
-        if (type == 3) {
-            std::cerr << "Not Implemented yet" << std::endl;
-            ier = FINUFFT_ERR_TYPE_NOTVALID;
-        }
-    } break;
-    case 2: {
-        if (type == 1)
-            ier = cufinufft2d1_exec<T>(d_c, d_fk, d_plan);
-        if (type == 2)
-            ier = cufinufft2d2_exec<T>(d_c, d_fk, d_plan);
-        if (type == 3) {
-            std::cerr << "Not Implemented yet" << std::endl;
-            ier = FINUFFT_ERR_TYPE_NOTVALID;
-        }
-    } break;
-    case 3: {
-        if (type == 1)
-            ier = cufinufft3d1_exec<T>(d_c, d_fk, d_plan);
-        if (type == 2)
-            ier = cufinufft3d2_exec<T>(d_c, d_fk, d_plan);
-        if (type == 3) {
-            std::cerr << "Not Implemented yet" << std::endl;
-            ier = FINUFFT_ERR_TYPE_NOTVALID;
-        }
-    } break;
+  cufinufft::utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id);
+  int ier;
+  int type = d_plan->type;
+  switch (d_plan->dim) {
+  case 1: {
+    if (type == 1) ier = cufinufft1d1_exec<T>(d_c, d_fk, d_plan);
+    if (type == 2) ier = cufinufft1d2_exec<T>(d_c, d_fk, d_plan);
+    if (type == 3) {
+      std::cerr << "Not Implemented yet" << std::endl;
+      ier = FINUFFT_ERR_TYPE_NOTVALID;
     }
+  } break;
+  case 2: {
+    if (type == 1) ier = cufinufft2d1_exec<T>(d_c, d_fk, d_plan);
+    if (type == 2) ier = cufinufft2d2_exec<T>(d_c, d_fk, d_plan);
+    if (type == 3) {
+      std::cerr << "Not Implemented yet" << std::endl;
+      ier = FINUFFT_ERR_TYPE_NOTVALID;
+    }
+  } break;
+  case 3: {
+    if (type == 1) ier = cufinufft3d1_exec<T>(d_c, d_fk, d_plan);
+    if (type == 2) ier = cufinufft3d2_exec<T>(d_c, d_fk, d_plan);
+    if (type == 3) {
+      std::cerr << "Not Implemented yet" << std::endl;
+      ier = FINUFFT_ERR_TYPE_NOTVALID;
+    }
+  } break;
+  }
 
-    return ier;
+  return ier;
 }
 
-template <typename T>
+template<typename T>
 int cufinufft_destroy_impl(cufinufft_plan_t<T> *d_plan)
 /*
     "destroy" stage (single and double precision versions).
@@ -435,21 +455,19 @@ int cufinufft_destroy_impl(cufinufft_plan_t<T> *d_plan)
         Also see ../docs/cppdoc.md for main user-facing documentation.
 */
 {
-    cufinufft::utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id);
+  cufinufft::utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id);
 
-    // Can't destroy a null pointer.
-    if (!d_plan)
-        return FINUFFT_ERR_PLAN_NOTVALID;
+  // Can't destroy a null pointer.
+  if (!d_plan) return FINUFFT_ERR_PLAN_NOTVALID;
 
-    using namespace cufinufft::memtransfer;
-    freegpumemory<T>(d_plan);
+  using namespace cufinufft::memtransfer;
+  freegpumemory<T>(d_plan);
 
-    if (d_plan->fftplan)
-        cufftDestroy(d_plan->fftplan);
+  if (d_plan->fftplan) cufftDestroy(d_plan->fftplan);
 
-    /* free/destruct the plan */
-    delete d_plan;
+  /* free/destruct the plan */
+  delete d_plan;
 
-    return 0;
+  return 0;
 } // namespace cufinufft
 #endif
diff --git a/include/cufinufft/memtransfer.h b/include/cufinufft/memtransfer.h
index 382f911e9..4c4788b9d 100644
--- a/include/cufinufft/memtransfer.h
+++ b/include/cufinufft/memtransfer.h
@@ -6,20 +6,13 @@
 namespace cufinufft {
 namespace memtransfer {
 
-template <typename T>
-int allocgpumem1d_plan(cufinufft_plan_t<T> *d_plan);
-template <typename T>
-int allocgpumem1d_nupts(cufinufft_plan_t<T> *d_plan);
-template <typename T>
-void freegpumemory(cufinufft_plan_t<T> *d_plan);
-template <typename T>
-int allocgpumem2d_plan(cufinufft_plan_t<T> *d_plan);
-template <typename T>
-int allocgpumem2d_nupts(cufinufft_plan_t<T> *d_plan);
-template <typename T>
-int allocgpumem3d_plan(cufinufft_plan_t<T> *d_plan);
-template <typename T>
-int allocgpumem3d_nupts(cufinufft_plan_t<T> *d_plan);
+template<typename T> int allocgpumem1d_plan(cufinufft_plan_t<T> *d_plan);
+template<typename T> int allocgpumem1d_nupts(cufinufft_plan_t<T> *d_plan);
+template<typename T> void freegpumemory(cufinufft_plan_t<T> *d_plan);
+template<typename T> int allocgpumem2d_plan(cufinufft_plan_t<T> *d_plan);
+template<typename T> int allocgpumem2d_nupts(cufinufft_plan_t<T> *d_plan);
+template<typename T> int allocgpumem3d_plan(cufinufft_plan_t<T> *d_plan);
+template<typename T> int allocgpumem3d_nupts(cufinufft_plan_t<T> *d_plan);
 
 } // namespace memtransfer
 } // namespace cufinufft
diff --git a/include/cufinufft/precision_independent.h b/include/cufinufft/precision_independent.h
index ff98506bf..9fa48a07e 100644
--- a/include/cufinufft/precision_independent.h
+++ b/include/cufinufft/precision_independent.h
@@ -6,8 +6,8 @@
 #define PRECISION_INDEPENDENT_H
 
 #include <cuComplex.h>
-#define rpart(x) (cuCreal(x))
-#define ipart(x) (cuCimag(x))
+#define rpart(x)    (cuCreal(x))
+#define ipart(x)    (cuCimag(x))
 #define cmplx(x, y) (make_cuDoubleComplex(x, y))
 namespace cufinufft {
 namespace common {
@@ -20,42 +20,51 @@ __device__ RT cabs(const CT &z);
 __device__ CT cpow(const CT &z, const int &n);
 
 /* Common Kernels from spreadinterp3d */
-__host__ __device__ int calc_global_index(int xidx, int yidx, int zidx, int onx, int ony, int onz, int bnx, int bny,
-                                          int bnz);
-__device__ int calc_global_index_v2(int xidx, int yidx, int zidx, int nbinx, int nbiny, int nbinz);
+__host__ __device__ int calc_global_index(int xidx, int yidx, int zidx, int onx, int ony,
+                                          int onz, int bnx, int bny, int bnz);
+__device__ int calc_global_index_v2(int xidx, int yidx, int zidx, int nbinx, int nbiny,
+                                    int nbinz);
 
 /* spreadinterp 1d */
-__global__ void calc_subprob_1d(int *bin_size, int *num_subprob, int maxsubprobsize, int numbins);
+__global__ void calc_subprob_1d(int *bin_size, int *num_subprob, int maxsubprobsize,
+                                int numbins);
 
-__global__ void map_b_into_subprob_1d(int *d_subprob_to_bin, int *d_subprobstartpts, int *d_numsubprob, int numbins);
+__global__ void map_b_into_subprob_1d(int *d_subprob_to_bin, int *d_subprobstartpts,
+                                      int *d_numsubprob, int numbins);
 
 __global__ void trivial_global_sort_index_1d(int M, int *index);
 
 /* spreadinterp 2d */
-__global__ void calc_subprob_2d(int *bin_size, int *num_subprob, int maxsubprobsize, int numbins);
+__global__ void calc_subprob_2d(int *bin_size, int *num_subprob, int maxsubprobsize,
+                                int numbins);
 
-__global__ void map_b_into_subprob_2d(int *d_subprob_to_bin, int *d_subprobstartpts, int *d_numsubprob, int numbins);
+__global__ void map_b_into_subprob_2d(int *d_subprob_to_bin, int *d_subprobstartpts,
+                                      int *d_numsubprob, int numbins);
 
 __global__ void trivial_global_sort_index_2d(int M, int *index);
 
 /* spreadinterp3d */
-__global__ void calc_subprob_3d_v2(int *bin_size, int *num_subprob, int maxsubprobsize, int numbins);
+__global__ void calc_subprob_3d_v2(int *bin_size, int *num_subprob, int maxsubprobsize,
+                                   int numbins);
 
-__global__ void map_b_into_subprob_3d_v2(int *d_subprob_to_bin, int *d_subprobstartpts, int *d_numsubprob, int numbins);
+__global__ void map_b_into_subprob_3d_v2(int *d_subprob_to_bin, int *d_subprobstartpts,
+                                         int *d_numsubprob, int numbins);
 
-__global__ void calc_subprob_3d_v1(int binsperobinx, int binsperobiny, int binsperobinz, int *bin_size,
-                                   int *num_subprob, int maxsubprobsize, int numbins);
+__global__ void calc_subprob_3d_v1(int binsperobinx, int binsperobiny, int binsperobinz,
+                                   int *bin_size, int *num_subprob, int maxsubprobsize,
+                                   int numbins);
 
-__global__ void map_b_into_subprob_3d_v1(int *d_subprob_to_obin, int *d_subprobstartpts, int *d_numsubprob,
-                                         int numbins);
+__global__ void map_b_into_subprob_3d_v1(int *d_subprob_to_obin, int *d_subprobstartpts,
+                                         int *d_numsubprob, int numbins);
 
 __global__ void trivial_global_sort_index_3d(int M, int *index);
 
-__global__ void fill_ghost_bins(int binsperobinx, int binsperobiny, int binsperobinz, int nobinx, int nobiny,
-                                int nobinz, int *binsize);
+__global__ void fill_ghost_bins(int binsperobinx, int binsperobiny, int binsperobinz,
+                                int nobinx, int nobiny, int nobinz, int *binsize);
 
-__global__ void ghost_bin_pts_index(int binsperobinx, int binsperobiny, int binsperobinz, int nobinx, int nobiny,
-                                    int nobinz, int *binsize, int *index, int *binstartpts, int M);
+__global__ void ghost_bin_pts_index(int binsperobinx, int binsperobiny, int binsperobinz,
+                                    int nobinx, int nobiny, int nobinz, int *binsize,
+                                    int *index, int *binstartpts, int M);
 } // namespace common
 } // namespace cufinufft
 #endif
diff --git a/include/cufinufft/spreadinterp.h b/include/cufinufft/spreadinterp.h
index 85850e92a..da1c59930 100644
--- a/include/cufinufft/spreadinterp.h
+++ b/include/cufinufft/spreadinterp.h
@@ -1,21 +1,20 @@
 #ifndef __CUSPREADINTERP_H__
 #define __CUSPREADINTERP_H__
 
+#include <cmath>
 #include <cufinufft/types.h>
 #include <finufft_spread_opts.h>
-#include <cmath>
 
 namespace cufinufft {
 namespace spreadinterp {
 
-template <typename T>
-static __forceinline__ __device__ T fold_rescale(T x, int N) {
+template<typename T> static __forceinline__ __device__ T fold_rescale(T x, int N) {
   static constexpr const auto x2pi = T(0.159154943091895345554011992339482617);
-  const T result = x * x2pi + T(0.5);
-  return (result-floor(result)) * T(N);
+  const T result                   = x * x2pi + T(0.5);
+  return (result - floor(result)) * T(N);
 }
 
-template <typename T>
+template<typename T>
 static inline T evaluate_kernel(T x, const finufft_spread_opts &opts)
 /* ES ("exp sqrt") kernel evaluation at single real argument:
       phi(x) = exp(beta.sqrt(1 - (2x/n_s)^2)),    for |x| < nspread/2
@@ -23,17 +22,17 @@ static inline T evaluate_kernel(T x, const finufft_spread_opts &opts)
    approximation to prolate spheroidal wavefunction (PSWF) of order 0.
    This is the "reference implementation", used by eg common/onedim_* 2/17/17 */
 {
-    if (abs(x) >= opts.ES_halfwidth)
-        // if spreading/FT careful, shouldn't need this if, but causes no speed hit
-        return 0.0;
-    else
-        return exp(opts.ES_beta * sqrt(1.0 - opts.ES_c * x * x));
+  if (abs(x) >= opts.ES_halfwidth)
+    // if spreading/FT careful, shouldn't need this if, but causes no speed hit
+    return 0.0;
+  else
+    return exp(opts.ES_beta * sqrt(1.0 - opts.ES_c * x * x));
 }
 
-template <typename T>
+template<typename T>
 int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmeth);
 
-template <typename T>
+template<typename T>
 static __forceinline__ __device__ T evaluate_kernel(T x, T es_c, T es_beta, int ns)
 /* ES ("exp sqrt") kernel evaluation at single real argument:
    phi(x) = exp(beta.sqrt(1 - (2x/n_s)^2)),    for |x| < nspread/2
@@ -42,89 +41,95 @@ static __forceinline__ __device__ T evaluate_kernel(T x, T es_c, T es_beta, int
    This is the "reference implementation", used by eg common/onedim_*
     2/17/17 */
 {
-    return abs(x) < ns / 2.0 ? exp(es_beta * (sqrt(1.0 - es_c * x * x))) : 0.0;
+  return abs(x) < ns / 2.0 ? exp(es_beta * (sqrt(1.0 - es_c * x * x))) : 0.0;
 }
 
-template <typename T>
-static __inline__ __device__ void eval_kernel_vec_horner(T *ker, const T x, const int w, const double upsampfac)
+template<typename T>
+static __inline__ __device__ void eval_kernel_vec_horner(T *ker, const T x, const int w,
+                                                         const double upsampfac)
 /* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at
    x_j = x + j,  for j=0,..,w-1.  Thus x in [-w/2,-w/2+1].   w is aka ns.
    This is the current evaluation method, since it's faster (except i7 w=16).
    Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */
 {
-    T z = 2 * x + w - 1.0; // scale so local grid offset z in [-1,1]
-    // insert the auto-generated code which expects z, w args, writes to ker...
-    if (upsampfac == 2.0) { // floating point equality is fine here
-        using FLT = T;
-        using CUFINUFFT_FLT = T;
+  T z = 2 * x + w - 1.0; // scale so local grid offset z in [-1,1]
+  // insert the auto-generated code which expects z, w args, writes to ker...
+  if (upsampfac == 2.0) { // floating point equality is fine here
+    using FLT           = T;
+    using CUFINUFFT_FLT = T;
 #include "cufinufft/contrib/ker_horner_allw_loop.inc"
-    }
+  }
 }
 
-template <typename T>
-static __inline__ __device__ void eval_kernel_vec(T *ker, const T x, const int w, const T es_c, const T es_beta) {
-    for (int i = 0; i < w; i++) {
-        ker[i] = evaluate_kernel(abs(x + i), es_c, es_beta, w);
-    }
+template<typename T>
+static __inline__ __device__ void eval_kernel_vec(T *ker, const T x, const int w,
+                                                  const T es_c, const T es_beta) {
+  for (int i = 0; i < w; i++) {
+    ker[i] = evaluate_kernel(abs(x + i), es_c, es_beta, w);
+  }
 }
 
 // Functions for calling different methods of spreading & interpolation
-template <typename T>
-int cuspread1d(cufinufft_plan_t<T> *d_plan, int blksize);
-template <typename T>
-int cuinterp1d(cufinufft_plan_t<T> *d_plan, int blksize);
+template<typename T> int cuspread1d(cufinufft_plan_t<T> *d_plan, int blksize);
+template<typename T> int cuinterp1d(cufinufft_plan_t<T> *d_plan, int blksize);
 
-template <typename T>
-int cuspread2d(cufinufft_plan_t<T> *d_plan, int blksize);
-template <typename T>
-int cuinterp2d(cufinufft_plan_t<T> *d_plan, int blksize);
-template <typename T>
-int cuspread3d(cufinufft_plan_t<T> *d_plan, int blksize);
-template <typename T>
-int cuinterp3d(cufinufft_plan_t<T> *d_plan, int blksize);
+template<typename T> int cuspread2d(cufinufft_plan_t<T> *d_plan, int blksize);
+template<typename T> int cuinterp2d(cufinufft_plan_t<T> *d_plan, int blksize);
+template<typename T> int cuspread3d(cufinufft_plan_t<T> *d_plan, int blksize);
+template<typename T> int cuinterp3d(cufinufft_plan_t<T> *d_plan, int blksize);
 
 // Wrappers for methods of spreading
-template <typename T>
+template<typename T>
 int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t<T> *d_plan);
-template <typename T>
+template<typename T>
 int cuspread1d_nuptsdriven(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize);
-template <typename T>
+template<typename T>
 int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t<T> *d_plan);
-template <typename T>
+template<typename T>
 int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize);
 
-template <typename T>
+template<typename T>
 int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan);
-template <typename T>
-int cuspread2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan, int blksize);
-template <typename T>
+template<typename T>
+int cuspread2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan,
+                           int blksize);
+template<typename T>
 int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan);
-template <typename T>
+template<typename T>
 int cuspread2d_subprob(int nf1, int nf2, int m, cufinufft_plan_t<T> *d_plan, int blksize);
-template <typename T>
-int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan);
-template <typename T>
-int cuspread3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan, int blksize);
-template <typename T>
-int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan);
-template <typename T>
-int cuspread3d_blockgather(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan, int blksize);
-template <typename T>
-int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan);
-template <typename T>
-int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan, int blksize);
+template<typename T>
+int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M,
+                                cufinufft_plan_t<T> *d_plan);
+template<typename T>
+int cuspread3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan,
+                           int blksize);
+template<typename T>
+int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M,
+                                cufinufft_plan_t<T> *d_plan);
+template<typename T>
+int cuspread3d_blockgather(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan,
+                           int blksize);
+template<typename T>
+int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M,
+                            cufinufft_plan_t<T> *d_plan);
+template<typename T>
+int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan,
+                       int blksize);
 
 // Wrappers for methods of interpolation
-template <typename T>
+template<typename T>
 int cuinterp1d_nuptsdriven(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize);
-template <typename T>
-int cuinterp2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan, int blksize);
-template <typename T>
+template<typename T>
+int cuinterp2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan,
+                           int blksize);
+template<typename T>
 int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan, int blksize);
-template <typename T>
-int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan, int blksize);
-template <typename T>
-int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan, int blksize);
+template<typename T>
+int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan,
+                           int blksize);
+template<typename T>
+int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan,
+                       int blksize);
 
 } // namespace spreadinterp
 } // namespace cufinufft
diff --git a/include/cufinufft/types.h b/include/cufinufft/types.h
index 246b4aaa1..05b2f6c36 100644
--- a/include/cufinufft/types.h
+++ b/include/cufinufft/types.h
@@ -3,99 +3,88 @@
 
 #include <cufft.h>
 
+#include <cufinufft/defs.h>
 #include <cufinufft_opts.h>
 #include <finufft_spread_opts.h>
 #include <type_traits>
-#include <cufinufft/defs.h>
 
 #include <complex.h>
 
 #define CUFINUFFT_BIGINT int
 
 // Ugly trick to map a template to a fixed type, here cuda_complex<T>
-template <typename T>
-struct cuda_complex_impl;
-template <>
-struct cuda_complex_impl<float> {
-    using type = cuFloatComplex;
+template<typename T> struct cuda_complex_impl;
+template<> struct cuda_complex_impl<float> {
+  using type = cuFloatComplex;
 };
-template <>
-struct cuda_complex_impl<double> {
-    using type = cuDoubleComplex;
+template<> struct cuda_complex_impl<double> {
+  using type = cuDoubleComplex;
 };
 
-template <typename T>
-using cuda_complex = typename cuda_complex_impl<T>::type;
-
-template <typename T>
-struct cufinufft_plan_t {
-    cufinufft_opts opts;
-    finufft_spread_opts spopts;
-
-    int type;
-    int dim;
-    CUFINUFFT_BIGINT M;
-    CUFINUFFT_BIGINT nf1;
-    CUFINUFFT_BIGINT nf2;
-    CUFINUFFT_BIGINT nf3;
-    CUFINUFFT_BIGINT ms;
-    CUFINUFFT_BIGINT mt;
-    CUFINUFFT_BIGINT mu;
-    int ntransf;
-    int maxbatchsize;
-    int iflag;
-
-    int totalnumsubprob;
-    T *fwkerhalf1;
-    T *fwkerhalf2;
-    T *fwkerhalf3;
-
-    T *kx;
-    T *ky;
-    T *kz;
-    cuda_complex<T> *c;
-    cuda_complex<T> *fw;
-    cuda_complex<T> *fk;
-
-    // Arrays that used in subprob method
-    int *idxnupts;        // length: #nupts, index of the nupts in the bin-sorted order
-    int *sortidx;         // length: #nupts, order inside the bin the nupt belongs to
-    int *numsubprob;      // length: #bins,  number of subproblems in each bin
-    int *binsize;         // length: #bins, number of nonuniform ponits in each bin
-    int *binstartpts;     // length: #bins, exclusive scan of array binsize
-    int *subprob_to_bin;  // length: #subproblems, the bin the subproblem works on
-    int *subprobstartpts; // length: #bins, exclusive scan of array numsubprob
-
-    // Arrays for 3d (need to sort out)
-    int *numnupts;
-    int *subprob_to_nupts;
-
-    // Temporary variables to do fseries precomputation
-    std::complex<double> fseries_precomp_a[3 * MAX_NQUAD];
-    T fseries_precomp_f[3 * MAX_NQUAD];
-
-    cufftHandle fftplan;
-    cudaStream_t stream;
+template<typename T> using cuda_complex = typename cuda_complex_impl<T>::type;
+
+template<typename T> struct cufinufft_plan_t {
+  cufinufft_opts opts;
+  finufft_spread_opts spopts;
+
+  int type;
+  int dim;
+  CUFINUFFT_BIGINT M;
+  CUFINUFFT_BIGINT nf1;
+  CUFINUFFT_BIGINT nf2;
+  CUFINUFFT_BIGINT nf3;
+  CUFINUFFT_BIGINT ms;
+  CUFINUFFT_BIGINT mt;
+  CUFINUFFT_BIGINT mu;
+  int ntransf;
+  int maxbatchsize;
+  int iflag;
+
+  int totalnumsubprob;
+  T *fwkerhalf1;
+  T *fwkerhalf2;
+  T *fwkerhalf3;
+
+  T *kx;
+  T *ky;
+  T *kz;
+  cuda_complex<T> *c;
+  cuda_complex<T> *fw;
+  cuda_complex<T> *fk;
+
+  // Arrays that are used in the subprob method
+  int *idxnupts;        // length: #nupts, index of the nupts in the bin-sorted order
+  int *sortidx;         // length: #nupts, order inside the bin the nupt belongs to
+  int *numsubprob;      // length: #bins,  number of subproblems in each bin
+  int *binsize;         // length: #bins, number of nonuniform points in each bin
+  int *binstartpts;     // length: #bins, exclusive scan of array binsize
+  int *subprob_to_bin;  // length: #subproblems, the bin the subproblem works on
+  int *subprobstartpts; // length: #bins, exclusive scan of array numsubprob
+
+  // Arrays for 3d (need to sort out)
+  int *numnupts;
+  int *subprob_to_nupts;
+
+  // Temporary variables to do fseries precomputation
+  std::complex<double> fseries_precomp_a[3 * MAX_NQUAD];
+  T fseries_precomp_f[3 * MAX_NQUAD];
+
+  cufftHandle fftplan;
+  cudaStream_t stream;
 };
 
-template <typename T>
-static cufftType_t cufft_type();
-template <>
-inline cufftType_t cufft_type<float>() {
-    return CUFFT_C2C;
-}
+template<typename T> static cufftType_t cufft_type();
+template<> inline cufftType_t cufft_type<float>() { return CUFFT_C2C; }
 
-template <>
-inline cufftType_t cufft_type<double>() {
-    return CUFFT_Z2Z;
-}
+template<> inline cufftType_t cufft_type<double>() { return CUFFT_Z2Z; }
 
-static inline cufftResult cufft_ex(cufftHandle plan, cufftComplex *idata, cufftComplex *odata, int direction) {
-    return cufftExecC2C(plan, idata, odata, direction);
+static inline cufftResult cufft_ex(cufftHandle plan, cufftComplex *idata,
+                                   cufftComplex *odata, int direction) {
+  return cufftExecC2C(plan, idata, odata, direction);
 }
-static inline cufftResult cufft_ex(cufftHandle plan, cufftDoubleComplex *idata, cufftDoubleComplex *odata,
-                                   int direction) {
-    return cufftExecZ2Z(plan, idata, odata, direction);
+static inline cufftResult cufft_ex(cufftHandle plan, cufftDoubleComplex *idata,
+                                   cufftDoubleComplex *odata, int direction) {
+  return cufftExecZ2Z(plan, idata, odata, direction);
 }
 
 #endif
diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h
index e8deb42e9..3455b99c0 100644
--- a/include/cufinufft/utils.h
+++ b/include/cufinufft/utils.h
@@ -15,59 +15,58 @@
 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 || defined(__clang__)
 #else
 __inline__ __device__ double atomicAdd(double *address, double val) {
-    unsigned long long int *address_as_ull = (unsigned long long int *)address;
-    unsigned long long int old = *address_as_ull, assumed;
+  unsigned long long int *address_as_ull = (unsigned long long int *)address;
+  unsigned long long int old             = *address_as_ull, assumed;
 
-    do {
-        assumed = old;
-        old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
+  do {
+    assumed = old;
+    old     = atomicCAS(address_as_ull, assumed,
+                        __double_as_longlong(val + __longlong_as_double(assumed)));
 
-        // Note: uses integer comparison to avoid hang in case of NaN
-        // (since NaN != NaN)
-    } while (assumed != old);
+    // Note: uses integer comparison to avoid hang in case of NaN
+    // (since NaN != NaN)
+  } while (assumed != old);
 
-    return __longlong_as_double(old);
+  return __longlong_as_double(old);
 }
 #endif
 
 namespace cufinufft {
 namespace utils {
 class WithCudaDevice {
-  public:
-    WithCudaDevice(int device) {
-        cudaGetDevice(&orig_device_);
-        cudaSetDevice(device);
-    }
+public:
+  WithCudaDevice(int device) {
+    cudaGetDevice(&orig_device_);
+    cudaSetDevice(device);
+  }
 
-    ~WithCudaDevice() { cudaSetDevice(orig_device_); }
+  ~WithCudaDevice() { cudaSetDevice(orig_device_); }
 
-  private:
-    int orig_device_;
+private:
+  int orig_device_;
 };
 
 // jfm timer class
 class CNTime {
-  public:
-    void start();
-    double restart();
-    double elapsedsec();
+public:
+  void start();
+  double restart();
+  double elapsedsec();
 
-  private:
-    struct timeval initial;
+private:
+  struct timeval initial;
 };
 
 // ahb math helpers
 CUFINUFFT_BIGINT next235beven(CUFINUFFT_BIGINT n, CUFINUFFT_BIGINT b);
 
-template <typename T>
-T infnorm(int n, std::complex<T> *a) {
-    T nrm = 0.0;
-    for (int m = 0; m < n; ++m) {
-        T aa = real(conj(a[m]) * a[m]);
-        if (aa > nrm)
-            nrm = aa;
-    }
-    return sqrt(nrm);
+template<typename T> T infnorm(int n, std::complex<T> *a) {
+  T nrm = 0.0;
+  for (int m = 0; m < n; ++m) {
+    T aa = real(conj(a[m]) * a[m]);
+    if (aa > nrm) nrm = aa;
+  }
+  return sqrt(nrm);
 }
 } // namespace utils
 } // namespace cufinufft
diff --git a/include/cufinufft_opts.h b/include/cufinufft_opts.h
index 9760884ae..0fb0d8f62 100644
--- a/include/cufinufft_opts.h
+++ b/include/cufinufft_opts.h
@@ -2,30 +2,30 @@
 #define __CUFINUFFT_OPTS_H__
 
 typedef struct cufinufft_opts { // see cufinufft_default_opts() for defaults
-    double upsampfac;           // upsampling ratio sigma, only 2.0 (standard) is implemented
-                                /* following options are for gpu */
-    int gpu_method;             // 1: nonuniform-pts driven, 2: shared mem (SM)
-    int gpu_sort;               // when NU-pts driven: 0: no sort (GM), 1: sort (GM-sort)
+  double upsampfac; // upsampling ratio sigma, only 2.0 (standard) is implemented
+                    /* following options are for gpu */
+  int gpu_method;   // 1: nonuniform-pts driven, 2: shared mem (SM)
+  int gpu_sort;     // when NU-pts driven: 0: no sort (GM), 1: sort (GM-sort)
 
-    int gpu_binsizex; // used for 2D, 3D subproblem method
-    int gpu_binsizey;
-    int gpu_binsizez;
+  int gpu_binsizex; // used for 2D, 3D subproblem method
+  int gpu_binsizey;
+  int gpu_binsizez;
 
-    int gpu_obinsizex; // used for 3D spread block gather method
-    int gpu_obinsizey;
-    int gpu_obinsizez;
+  int gpu_obinsizex; // used for 3D spread block gather method
+  int gpu_obinsizey;
+  int gpu_obinsizez;
 
-    int gpu_maxsubprobsize;
-    int gpu_kerevalmeth; // 0: direct exp(sqrt()), 1: Horner ppval
+  int gpu_maxsubprobsize;
+  int gpu_kerevalmeth;      // 0: direct exp(sqrt()), 1: Horner ppval
 
-    int gpu_spreadinterponly; // 0: NUFFT, 1: spread or interpolation only
+  int gpu_spreadinterponly; // 0: NUFFT, 1: spread or interpolation only
 
-    int gpu_maxbatchsize;
+  int gpu_maxbatchsize;
 
-    /* multi-gpu support */
-    int gpu_device_id;
+  /* multi-gpu support */
+  int gpu_device_id;
 
-    void *gpu_stream;
+  void *gpu_stream;
 } cufinufft_opts;
 
 #endif
diff --git a/include/finufft.h b/include/finufft.h
index 71a38f9be..487a3eb4f 100644
--- a/include/finufft.h
+++ b/include/finufft.h
@@ -5,7 +5,6 @@
 // They will clobber any prior macros starting FINUFFT*, so in the lib/test
 // sources finufft.h must be included before defs.h
 
-
 /* Devnotes.
    A) Two precisions done by including the "either precision" headers twice.
    No use of the private headers for lib/test/example compilation is made.
@@ -37,7 +36,7 @@
 #define FINUFFT_BIGINT int64_t
 
 #ifndef __cplusplus
-#include <stdbool.h>     // for bool type in C (needed for item in plan struct)
+#include <stdbool.h> // for bool type in C (needed for item in plan struct)
 #endif
 
 // this macro name has to be safe since exposed to user
@@ -50,4 +49,4 @@
 // clean up any purely local defs that are not in finufft_eitherprec.h...
 #undef FINUFFT_BIGINT
 
-#endif  // FINUFFT_H
+#endif // FINUFFT_H
diff --git a/include/finufft/dirft.h b/include/finufft/dirft.h
index 88f1dd2df..5d13265a4 100644
--- a/include/finufft/dirft.h
+++ b/include/finufft/dirft.h
@@ -3,16 +3,20 @@
 
 #include <finufft/defs.h>
 
-void dirft1d1(BIGINT nj,FLT* x,CPX* c,int isign,BIGINT ms, CPX* f);
-void dirft1d2(BIGINT nj,FLT* x,CPX* c,int iflag,BIGINT ms, CPX* f);
-void dirft1d3(BIGINT nj,FLT* x,CPX* c,int iflag,BIGINT nk, FLT* s, CPX* f);
+void dirft1d1(BIGINT nj, FLT *x, CPX *c, int isign, BIGINT ms, CPX *f);
+void dirft1d2(BIGINT nj, FLT *x, CPX *c, int iflag, BIGINT ms, CPX *f);
+void dirft1d3(BIGINT nj, FLT *x, CPX *c, int iflag, BIGINT nk, FLT *s, CPX *f);
 
-void dirft2d1(BIGINT nj,FLT* x,FLT *y,CPX* c,int iflag,BIGINT ms, BIGINT mt, CPX* f);
-void dirft2d2(BIGINT nj,FLT* x,FLT *y,CPX* c,int iflag,BIGINT ms, BIGINT mt, CPX* f);
-void dirft2d3(BIGINT nj,FLT* x,FLT *y,CPX* c,int iflag,BIGINT nk, FLT* s, FLT* t, CPX* f);
+void dirft2d1(BIGINT nj, FLT *x, FLT *y, CPX *c, int iflag, BIGINT ms, BIGINT mt, CPX *f);
+void dirft2d2(BIGINT nj, FLT *x, FLT *y, CPX *c, int iflag, BIGINT ms, BIGINT mt, CPX *f);
+void dirft2d3(BIGINT nj, FLT *x, FLT *y, CPX *c, int iflag, BIGINT nk, FLT *s, FLT *t,
+              CPX *f);
 
-void dirft3d1(BIGINT nj,FLT* x,FLT *y,FLT *z,CPX* c,int iflag,BIGINT ms, BIGINT mt, BIGINT mu, CPX* f);
-void dirft3d2(BIGINT nj,FLT* x,FLT *y,FLT *z,CPX* c,int iflag,BIGINT ms, BIGINT mt, BIGINT mu, CPX* f);
-void dirft3d3(BIGINT nj,FLT* x,FLT *y,FLT *z,CPX* c,int iflag,BIGINT nk, FLT* s, FLT* t, FLT *u, CPX* f);
+void dirft3d1(BIGINT nj, FLT *x, FLT *y, FLT *z, CPX *c, int iflag, BIGINT ms, BIGINT mt,
+              BIGINT mu, CPX *f);
+void dirft3d2(BIGINT nj, FLT *x, FLT *y, FLT *z, CPX *c, int iflag, BIGINT ms, BIGINT mt,
+              BIGINT mu, CPX *f);
+void dirft3d3(BIGINT nj, FLT *x, FLT *y, FLT *z, CPX *c, int iflag, BIGINT nk, FLT *s,
+              FLT *t, FLT *u, CPX *f);
 
 #endif
diff --git a/include/finufft/fftw_defs.h b/include/finufft/fftw_defs.h
index 89d86f0de..1771ff259 100644
--- a/include/finufft/fftw_defs.h
+++ b/include/finufft/fftw_defs.h
@@ -7,42 +7,42 @@
 // precision library compilation, which need different FFTW command symbols.
 // Barnett simplified via FFTWIFY, 6/7/22.
 
-#include <fftw3.h>          // (after complex.h) needed so can typedef FFTW_CPX
+#include <fftw3.h> // (after complex.h) needed so can typedef FFTW_CPX
 
 // precision-switching names for interfaces to FFTW...
 #ifdef SINGLE
-  // macro to prepend fftw_ (for double) or fftwf_ (for single) to a string
-  // without a space. The 2nd level of indirection is needed for safety, see:
-  // https://isocpp.org/wiki/faq/misc-technical-issues#macros-with-token-pasting
-  #define FFTWIFY_UNSAFE(x) fftwf_##x
+// macro to prepend fftw_ (for double) or fftwf_ (for single) to a string
+// without a space. The 2nd level of indirection is needed for safety, see:
+// https://isocpp.org/wiki/faq/misc-technical-issues#macros-with-token-pasting
+#define FFTWIFY_UNSAFE(x) fftwf_##x
 #else
-  #define FFTWIFY_UNSAFE(x) fftw_##x
+#define FFTWIFY_UNSAFE(x) fftw_##x
 #endif
-#define FFTWIFY(x) FFTWIFY_UNSAFE(x)
+#define FFTWIFY(x)         FFTWIFY_UNSAFE(x)
 // now use this tool (note we replaced typedefs v<=2.0.4, in favor of macros):
-#define FFTW_CPX FFTWIFY(complex)
-#define FFTW_PLAN FFTWIFY(plan)
-#define FFTW_ALLOC_RE FFTWIFY(alloc_real)
-#define FFTW_ALLOC_CPX FFTWIFY(alloc_complex)
-#define FFTW_PLAN_1D FFTWIFY(plan_dft_1d)
-#define FFTW_PLAN_2D FFTWIFY(plan_dft_2d)
-#define FFTW_PLAN_3D FFTWIFY(plan_dft_3d)
+#define FFTW_CPX           FFTWIFY(complex)
+#define FFTW_PLAN          FFTWIFY(plan)
+#define FFTW_ALLOC_RE      FFTWIFY(alloc_real)
+#define FFTW_ALLOC_CPX     FFTWIFY(alloc_complex)
+#define FFTW_PLAN_1D       FFTWIFY(plan_dft_1d)
+#define FFTW_PLAN_2D       FFTWIFY(plan_dft_2d)
+#define FFTW_PLAN_3D       FFTWIFY(plan_dft_3d)
 #define FFTW_PLAN_MANY_DFT FFTWIFY(plan_many_dft)
-#define FFTW_EX FFTWIFY(execute)
-#define FFTW_DE FFTWIFY(destroy_plan)
-#define FFTW_FR FFTWIFY(free)
+#define FFTW_EX            FFTWIFY(execute)
+#define FFTW_DE            FFTWIFY(destroy_plan)
+#define FFTW_FR            FFTWIFY(free)
 #define FFTW_FORGET_WISDOM FFTWIFY(forget_wisdom)
-#define FFTW_CLEANUP FFTWIFY(cleanup)
+#define FFTW_CLEANUP       FFTWIFY(cleanup)
 // the following OMP switch could be done in the src code instead...
 #ifdef _OPENMP
-  #define FFTW_INIT FFTWIFY(init_threads)
-  #define FFTW_PLAN_TH FFTWIFY(plan_with_nthreads)
-  #define FFTW_CLEANUP_THREADS FFTWIFY(cleanup_threads)
+#define FFTW_INIT            FFTWIFY(init_threads)
+#define FFTW_PLAN_TH         FFTWIFY(plan_with_nthreads)
+#define FFTW_CLEANUP_THREADS FFTWIFY(cleanup_threads)
 #else
-  // no OMP (no fftw{f}_threads or _omp), need dummy fftw threads calls...
-  #define FFTW_INIT()
-  #define FFTW_PLAN_TH(x)
-  #define FFTW_CLEANUP_THREADS()
+// no OMP (no fftw{f}_threads or _omp), need dummy fftw threads calls...
+#define FFTW_INIT()
+#define FFTW_PLAN_TH(x)
+#define FFTW_CLEANUP_THREADS()
 #endif
 
-#endif  // FFTW_DEFS_H
+#endif // FFTW_DEFS_H
diff --git a/include/finufft/spreadinterp.h b/include/finufft/spreadinterp.h
index 853b6c2b1..0900dd31b 100644
--- a/include/finufft/spreadinterp.h
+++ b/include/finufft/spreadinterp.h
@@ -26,32 +26,38 @@
 #define TF_OMIT_SPREADING            8 // don't interp/spread (dir=1: to subgrids)
 
 namespace finufft {
-  namespace spreadinterp {
+namespace spreadinterp {
 
 // things external (spreadinterp) interface needs...
-FINUFFT_EXPORT int FINUFFT_CDECL spreadinterp(BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform,
-		 BIGINT M, FLT *kx, FLT *ky, FLT *kz,
-		 FLT *data_nonuniform, finufft_spread_opts opts);
-FINUFFT_EXPORT int FINUFFT_CDECL spreadcheck(BIGINT N1, BIGINT N2, BIGINT N3,
-                 BIGINT M, FLT *kx, FLT *ky, FLT *kz, finufft_spread_opts opts);
-FINUFFT_EXPORT int FINUFFT_CDECL indexSort(BIGINT* sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, BIGINT M, 
-               FLT *kx, FLT *ky, FLT *kz, finufft_spread_opts opts);
-FINUFFT_EXPORT int FINUFFT_CDECL interpSorted(BIGINT* sort_indices,BIGINT N1, BIGINT N2, BIGINT N3, 
-		      FLT *data_uniform,BIGINT M, FLT *kx, FLT *ky, FLT *kz,
-		 FLT *data_nonuniform, finufft_spread_opts opts, int did_sort);
-FINUFFT_EXPORT int FINUFFT_CDECL spreadSorted(BIGINT* sort_indices,BIGINT N1, BIGINT N2, BIGINT N3, 
-		      FLT *data_uniform,BIGINT M, FLT *kx, FLT *ky, FLT *kz,
-		 FLT *data_nonuniform, finufft_spread_opts opts, int did_sort);
-FINUFFT_EXPORT int FINUFFT_CDECL spreadinterpSorted(BIGINT* sort_indices,BIGINT N1, BIGINT N2, BIGINT N3, 
-                       FLT *data_uniform,BIGINT M, FLT *kx, FLT *ky, FLT *kz,
-                       FLT *data_nonuniform, finufft_spread_opts opts,
-                       int did_sort);
-FINUFFT_EXPORT FLT FINUFFT_CDECL evaluate_kernel(FLT x,const finufft_spread_opts &opts);
-FINUFFT_EXPORT FLT FINUFFT_CDECL evaluate_kernel_noexp(FLT x,const finufft_spread_opts &opts);
-FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(finufft_spread_opts &opts,FLT eps,double upsampfac,
-                   int kerevalmeth, int debug, int showwarn, int dim);
-
-  }    // namespace
-}    // namespace
- 
-#endif  // SPREADINTERP_H
+FINUFFT_EXPORT int FINUFFT_CDECL spreadinterp(
+    BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, BIGINT M, FLT *kx, FLT *ky,
+    FLT *kz, FLT *data_nonuniform, finufft_spread_opts opts);
+FINUFFT_EXPORT int FINUFFT_CDECL spreadcheck(BIGINT N1, BIGINT N2, BIGINT N3, BIGINT M,
+                                             FLT *kx, FLT *ky, FLT *kz,
+                                             finufft_spread_opts opts);
+FINUFFT_EXPORT int FINUFFT_CDECL indexSort(BIGINT *sort_indices, BIGINT N1, BIGINT N2,
+                                           BIGINT N3, BIGINT M, FLT *kx, FLT *ky, FLT *kz,
+                                           finufft_spread_opts opts);
+FINUFFT_EXPORT int FINUFFT_CDECL interpSorted(
+    BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, BIGINT M,
+    FLT *kx, FLT *ky, FLT *kz, FLT *data_nonuniform, finufft_spread_opts opts,
+    int did_sort);
+FINUFFT_EXPORT int FINUFFT_CDECL spreadSorted(
+    BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, BIGINT M,
+    FLT *kx, FLT *ky, FLT *kz, FLT *data_nonuniform, finufft_spread_opts opts,
+    int did_sort);
+FINUFFT_EXPORT int FINUFFT_CDECL spreadinterpSorted(
+    BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, BIGINT M,
+    FLT *kx, FLT *ky, FLT *kz, FLT *data_nonuniform, finufft_spread_opts opts,
+    int did_sort);
+FINUFFT_EXPORT FLT FINUFFT_CDECL evaluate_kernel(FLT x, const finufft_spread_opts &opts);
+FINUFFT_EXPORT FLT FINUFFT_CDECL evaluate_kernel_noexp(FLT x,
+                                                       const finufft_spread_opts &opts);
+FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(finufft_spread_opts &opts, FLT eps,
+                                                double upsampfac, int kerevalmeth,
+                                                int debug, int showwarn, int dim);
+
+} // namespace spreadinterp
+} // namespace finufft
+
+#endif // SPREADINTERP_H
diff --git a/include/finufft/test_defs.h b/include/finufft/test_defs.h
index 54b058266..6142eadfb 100644
--- a/include/finufft/test_defs.h
+++ b/include/finufft/test_defs.h
@@ -7,7 +7,7 @@
 
 // TESTER SETTINGS...
 // how big a problem to check direct DFT for in 1D...
-#define TEST_BIGPROB 1e8
+#define TEST_BIGPROB   1e8
 // for omp rand filling
 #define TEST_RANDCHUNK 1000000
 
@@ -25,11 +25,11 @@
 #include <finufft/fftw_defs.h>
 
 // std stuff for tester src
-#include <math.h>
-#include <stdlib.h>
 #include <cstdio>
-#include <iostream>
 #include <iomanip>
+#include <iostream>
+#include <math.h>
+#include <stdlib.h>
 #include <vector>
 
-#endif   // TEST_DEFS_H
+#endif // TEST_DEFS_H
diff --git a/include/finufft/utils.h b/include/finufft/utils.h
index 8c2b7619e..9039fee96 100644
--- a/include/finufft/utils.h
+++ b/include/finufft/utils.h
@@ -7,18 +7,19 @@
 #include "finufft/defs.h"
 
 namespace finufft {
-  namespace utils {
+namespace utils {
 
 // ahb's low-level array helpers
-FINUFFT_EXPORT FLT FINUFFT_CDECL relerrtwonorm(BIGINT n, CPX* a, CPX* b);
-FINUFFT_EXPORT FLT FINUFFT_CDECL errtwonorm(BIGINT n, CPX* a, CPX* b);
-FINUFFT_EXPORT FLT FINUFFT_CDECL twonorm(BIGINT n, CPX* a);
-FINUFFT_EXPORT FLT FINUFFT_CDECL infnorm(BIGINT n, CPX* a);
-FINUFFT_EXPORT void FINUFFT_CDECL arrayrange(BIGINT n, FLT* a, FLT *lo, FLT *hi);
-FINUFFT_EXPORT void FINUFFT_CDECL indexedarrayrange(BIGINT n, BIGINT* i, FLT* a, FLT *lo, FLT *hi);
-FINUFFT_EXPORT void FINUFFT_CDECL arraywidcen(BIGINT n, FLT* a, FLT *w, FLT *c);
+FINUFFT_EXPORT FLT FINUFFT_CDECL relerrtwonorm(BIGINT n, CPX *a, CPX *b);
+FINUFFT_EXPORT FLT FINUFFT_CDECL errtwonorm(BIGINT n, CPX *a, CPX *b);
+FINUFFT_EXPORT FLT FINUFFT_CDECL twonorm(BIGINT n, CPX *a);
+FINUFFT_EXPORT FLT FINUFFT_CDECL infnorm(BIGINT n, CPX *a);
+FINUFFT_EXPORT void FINUFFT_CDECL arrayrange(BIGINT n, FLT *a, FLT *lo, FLT *hi);
+FINUFFT_EXPORT void FINUFFT_CDECL indexedarrayrange(BIGINT n, BIGINT *i, FLT *a, FLT *lo,
+                                                    FLT *hi);
+FINUFFT_EXPORT void FINUFFT_CDECL arraywidcen(BIGINT n, FLT *a, FLT *w, FLT *c);
 
-  }    // namespace
-}    // namespace
- 
-#endif  // UTILS_H
+} // namespace utils
+} // namespace finufft
+
+#endif // UTILS_H
diff --git a/include/finufft/utils_precindep.h b/include/finufft/utils_precindep.h
index 866d33198..0504bb8df 100644
--- a/include/finufft/utils_precindep.h
+++ b/include/finufft/utils_precindep.h
@@ -10,34 +10,35 @@
 #include <chrono>
 
 namespace finufft {
-  namespace utils {
-  
-  FINUFFT_EXPORT BIGINT FINUFFT_CDECL next235even(BIGINT n);
-
-  // jfm's timer class
-  class FINUFFT_EXPORT CNTime {
-  public:
-    void start();
-    double restart();
-    double elapsedsec();
-  private:
-    double initial;
-  };
-
-  // openmp helpers
-  int get_num_threads_parallel_block();
-    
-  } //namespace
-} //namespace
-  
+namespace utils {
+
+FINUFFT_EXPORT BIGINT FINUFFT_CDECL next235even(BIGINT n);
+
+// jfm's timer class
+class FINUFFT_EXPORT CNTime {
+public:
+  void start();
+  double restart();
+  double elapsedsec();
+
+private:
+  double initial;
+};
+
+// openmp helpers
+int get_num_threads_parallel_block();
+
+} // namespace utils
+} // namespace finufft
+
 // thread-safe rand number generator for Windows platform
 #ifdef _WIN32
 #include <random>
 namespace finufft {
-  namespace utils {
-  FINUFFT_EXPORT int FINUFFT_CDECL rand_r(unsigned int *seedp);
-  }   // namespace
-}   // namespace
+namespace utils {
+FINUFFT_EXPORT int FINUFFT_CDECL rand_r(unsigned int *seedp);
+} // namespace utils
+} // namespace finufft
 #endif
 
-#endif  // UTILS_PRECINDEP_H
+#endif // UTILS_PRECINDEP_H
diff --git a/include/finufft_eitherprec.h b/include/finufft_eitherprec.h
index 25703fb1d..250dec7c0 100644
--- a/include/finufft_eitherprec.h
+++ b/include/finufft_eitherprec.h
@@ -15,26 +15,26 @@
 // The 2nd level of indirection is needed for safety, see:
 // https://isocpp.org/wiki/faq/misc-technical-issues#macros-with-token-pasting
 #define FINUFFTIFY_UNSAFE(x) finufftf##x
-#define FINUFFT_FLT float
+#define FINUFFT_FLT          float
 #else
 #define FINUFFTIFY_UNSAFE(x) finufft##x
-#define FINUFFT_FLT double
+#define FINUFFT_FLT          double
 #endif
 #define FINUFFTIFY(x) FINUFFTIFY_UNSAFE(x)
 
 // decide which kind of complex numbers FINUFFT_CPX is (four options)
 #ifdef __cplusplus
 #define _USE_MATH_DEFINES
-#include <complex>          // C++ type
+#include <complex> // C++ type
 #define FINUFFT_COMPLEXIFY(X) std::complex<X>
 #else
-#include <complex.h>        // C99 type
+#include <complex.h> // C99 type
 #define FINUFFT_COMPLEXIFY(X) X complex
 #endif
-#define FINUFFT_CPX FINUFFT_COMPLEXIFY(FINUFFT_FLT)
+#define FINUFFT_CPX    FINUFFT_COMPLEXIFY(FINUFFT_FLT)
 
 // opaque pointer to finufft_plan private object, for this precision...
-#define FINUFFT_PLAN FINUFFTIFY(_plan)
+#define FINUFFT_PLAN   FINUFFTIFY(_plan)
 // the plan object pointed to... (doesn't need to be even defined here)
 #define FINUFFT_PLAN_S FINUFFTIFY(_plan_s)
 
@@ -51,13 +51,13 @@
    with it in the future we just need to update cmake for it to work
    instead of having a check on the msvc version. */
 #if defined(FINUFFT_DLL) && (defined(_WIN32) || defined(__WIN32__))
-#  if defined(dll_EXPORTS)
-#    define FINUFFT_EXPORT __declspec(dllexport)
-#  else
-#    define FINUFFT_EXPORT __declspec(dllimport)
-#  endif
+#if defined(dll_EXPORTS)
+#define FINUFFT_EXPORT __declspec(dllexport)
 #else
-#  define FINUFFT_EXPORT
+#define FINUFFT_EXPORT __declspec(dllimport)
+#endif
+#else
+#define FINUFFT_EXPORT
 #endif
 
 /* specify calling convention (Windows only)
@@ -66,81 +66,115 @@
    If the user code changes the default compiler calling convention, may need
    this when generating DLL. */
 #if defined(_WIN32) || defined(__WIN32__)
-#  define FINUFFT_CDECL __cdecl
+#define FINUFFT_CDECL __cdecl
 #else
-#  define FINUFFT_CDECL
+#define FINUFFT_CDECL
 #endif
 
 ////////////////////////////////////////////////////////////////////
 // PUBLIC METHOD INTERFACES. All are C-style even when used from C++...
 #ifdef __cplusplus
-extern "C"
-{
+extern "C" {
 #endif
 
-// ----------------- the plan ----------------------------------------------- 
+// ----------------- the plan -----------------------------------------------
 // the plan handle that we pass around is just a pointer to the plan object
 // that contains all the info. The latter is invisible to the public user.
-typedef struct FINUFFT_PLAN_S * FINUFFT_PLAN;
+typedef struct FINUFFT_PLAN_S *FINUFFT_PLAN;
 
-  
 // ------------------ the guru interface ------------------------------------
 // (sources in finufft.cpp)
-  
-  FINUFFT_EXPORT void FINUFFT_CDECL FINUFFTIFY(_default_opts)(finufft_opts *o);
-  FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_makeplan)(int type, int dim, FINUFFT_BIGINT* n_modes, int iflag, int n_transf, FINUFFT_FLT tol, FINUFFT_PLAN* plan, finufft_opts* o);
-  FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_setpts)(FINUFFT_PLAN plan , FINUFFT_BIGINT M, FINUFFT_FLT *xj, FINUFFT_FLT *yj, FINUFFT_FLT *zj, FINUFFT_BIGINT N, FINUFFT_FLT *s, FINUFFT_FLT *t, FINUFFT_FLT *u);
-  FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_execute)(FINUFFT_PLAN plan, FINUFFT_CPX* weights, FINUFFT_CPX* result);
-  FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_destroy)(FINUFFT_PLAN plan);
 
+FINUFFT_EXPORT void FINUFFT_CDECL FINUFFTIFY(_default_opts)(finufft_opts *o);
+FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_makeplan)(
+    int type, int dim, FINUFFT_BIGINT *n_modes, int iflag, int n_transf, FINUFFT_FLT tol,
+    FINUFFT_PLAN *plan, finufft_opts *o);
+FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_setpts)(
+    FINUFFT_PLAN plan, FINUFFT_BIGINT M, FINUFFT_FLT *xj, FINUFFT_FLT *yj,
+    FINUFFT_FLT *zj, FINUFFT_BIGINT N, FINUFFT_FLT *s, FINUFFT_FLT *t, FINUFFT_FLT *u);
+FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_execute)(
+    FINUFFT_PLAN plan, FINUFFT_CPX *weights, FINUFFT_CPX *result);
+FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_destroy)(FINUFFT_PLAN plan);
 
 // ----------------- the 18 simple interfaces -------------------------------
 // (sources in simpleinterfaces.cpp)
 
-  FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d1)(FINUFFT_BIGINT nj,FINUFFT_FLT* xj,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps,FINUFFT_BIGINT ms,
-                      FINUFFT_CPX* fk, finufft_opts *opts);
-  FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d1many)(int ntransf, FINUFFT_BIGINT nj,FINUFFT_FLT* xj,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps,FINUFFT_BIGINT ms,
-                         FINUFFT_CPX* fk, finufft_opts *opts);
-
-  FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d2)(FINUFFT_BIGINT nj,FINUFFT_FLT* xj,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps,FINUFFT_BIGINT ms,
-                      FINUFFT_CPX* fk, finufft_opts *opts);
-  FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d2many)(int ntransf, FINUFFT_BIGINT nj,FINUFFT_FLT* xj,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps,FINUFFT_BIGINT ms,
-                          FINUFFT_CPX* fk, finufft_opts *opts);
-  FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d3)(FINUFFT_BIGINT nj,FINUFFT_FLT* x,FINUFFT_CPX* c,int iflag,FINUFFT_FLT eps,FINUFFT_BIGINT nk, FINUFFT_FLT* s, FINUFFT_CPX* f, finufft_opts *opts);
-  FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d3many)(int ntransf, FINUFFT_BIGINT nj,FINUFFT_FLT* x,FINUFFT_CPX* c,int iflag,FINUFFT_FLT eps,FINUFFT_BIGINT nk, FINUFFT_FLT* s, FINUFFT_CPX* f, finufft_opts *opts);
-  FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(2d1)(FINUFFT_BIGINT nj,FINUFFT_FLT* xj,FINUFFT_FLT *yj,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps,
-	       FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_CPX* fk, finufft_opts *opts);
-  FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(2d1many)(int ndata, FINUFFT_BIGINT nj, FINUFFT_FLT* xj, FINUFFT_FLT *yj, FINUFFT_CPX* c, int iflag,
-                   FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_CPX* fk, finufft_opts *opts);
-  FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(2d2)(FINUFFT_BIGINT nj,FINUFFT_FLT* xj,FINUFFT_FLT *yj,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps,
-	       FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_CPX* fk, finufft_opts *opts);
-  FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(2d2many)(int ndata, FINUFFT_BIGINT nj, FINUFFT_FLT* xj, FINUFFT_FLT *yj, FINUFFT_CPX* c, int iflag,
-                   FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_CPX* fk, finufft_opts *opts);
-  FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(2d3)(FINUFFT_BIGINT nj,FINUFFT_FLT* x,FINUFFT_FLT *y,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps,FINUFFT_BIGINT nk, FINUFFT_FLT* s, FINUFFT_FLT* t, FINUFFT_CPX* fk, finufft_opts *opts);
-
-  FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(2d3many)(int ntransf, FINUFFT_BIGINT nj,FINUFFT_FLT* x,FINUFFT_FLT *y,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps,FINUFFT_BIGINT nk, FINUFFT_FLT* s, FINUFFT_FLT* t, FINUFFT_CPX* fk, finufft_opts *opts);
-
-  FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d1)(FINUFFT_BIGINT nj,FINUFFT_FLT* xj,FINUFFT_FLT *yj,FINUFFT_FLT *zj,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps,
-	       FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_BIGINT mu, FINUFFT_CPX* fk, finufft_opts *opts);
-  FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d1many)(int ntransfs, FINUFFT_BIGINT nj,FINUFFT_FLT* xj,FINUFFT_FLT *yj,FINUFFT_FLT *zj,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps,
-	       FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_BIGINT mu, FINUFFT_CPX* fk, finufft_opts *opts);
-
-  FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d2)(FINUFFT_BIGINT nj,FINUFFT_FLT* xj,FINUFFT_FLT *yj,FINUFFT_FLT *zj,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps,
-	       FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_BIGINT mu, FINUFFT_CPX* fk, finufft_opts *opts);
-  FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d2many)(int ntransf, FINUFFT_BIGINT nj,FINUFFT_FLT* xj,FINUFFT_FLT *yj,FINUFFT_FLT *zj,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps,
-	       FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_BIGINT mu, FINUFFT_CPX* fk, finufft_opts *opts);
-  FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d3)(FINUFFT_BIGINT nj,FINUFFT_FLT* x,FINUFFT_FLT *y,FINUFFT_FLT *z, FINUFFT_CPX* cj,int iflag,
-	       FINUFFT_FLT eps,FINUFFT_BIGINT nk,FINUFFT_FLT* s, FINUFFT_FLT* t, FINUFFT_FLT *u,
-	       FINUFFT_CPX* fk, finufft_opts *opts);
-  FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d3many)(int ntransf, FINUFFT_BIGINT nj,FINUFFT_FLT* x,FINUFFT_FLT *y,FINUFFT_FLT *z, FINUFFT_CPX* cj,int iflag,
-	       FINUFFT_FLT eps,FINUFFT_BIGINT nk,FINUFFT_FLT* s, FINUFFT_FLT* t, FINUFFT_FLT *u,
-	       FINUFFT_CPX* fk, finufft_opts *opts);
-  
+FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d1)(
+    FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_CPX *cj, int iflag, FINUFFT_FLT eps,
+    FINUFFT_BIGINT ms, FINUFFT_CPX *fk, finufft_opts *opts);
+FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d1many)(
+    int ntransf, FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_CPX *cj, int iflag,
+    FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_CPX *fk, finufft_opts *opts);
+
+FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d2)(
+    FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_CPX *cj, int iflag, FINUFFT_FLT eps,
+    FINUFFT_BIGINT ms, FINUFFT_CPX *fk, finufft_opts *opts);
+FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d2many)(
+    int ntransf, FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_CPX *cj, int iflag,
+    FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_CPX *fk, finufft_opts *opts);
+FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d3)(
+    FINUFFT_BIGINT nj, FINUFFT_FLT *x, FINUFFT_CPX *c, int iflag, FINUFFT_FLT eps,
+    FINUFFT_BIGINT nk, FINUFFT_FLT *s, FINUFFT_CPX *f, finufft_opts *opts);
+FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d3many)(
+    int ntransf, FINUFFT_BIGINT nj, FINUFFT_FLT *x, FINUFFT_CPX *c, int iflag,
+    FINUFFT_FLT eps, FINUFFT_BIGINT nk, FINUFFT_FLT *s, FINUFFT_CPX *f,
+    finufft_opts *opts);
+FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(2d1)(
+    FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_FLT *yj, FINUFFT_CPX *cj, int iflag,
+    FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_CPX *fk,
+    finufft_opts *opts);
+FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(2d1many)(
+    int ndata, FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_FLT *yj, FINUFFT_CPX *c,
+    int iflag, FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_CPX *fk,
+    finufft_opts *opts);
+FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(2d2)(
+    FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_FLT *yj, FINUFFT_CPX *cj, int iflag,
+    FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_CPX *fk,
+    finufft_opts *opts);
+FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(2d2many)(
+    int ndata, FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_FLT *yj, FINUFFT_CPX *c,
+    int iflag, FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_CPX *fk,
+    finufft_opts *opts);
+FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(2d3)(
+    FINUFFT_BIGINT nj, FINUFFT_FLT *x, FINUFFT_FLT *y, FINUFFT_CPX *cj, int iflag,
+    FINUFFT_FLT eps, FINUFFT_BIGINT nk, FINUFFT_FLT *s, FINUFFT_FLT *t, FINUFFT_CPX *fk,
+    finufft_opts *opts);
+
+FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(2d3many)(
+    int ntransf, FINUFFT_BIGINT nj, FINUFFT_FLT *x, FINUFFT_FLT *y, FINUFFT_CPX *cj,
+    int iflag, FINUFFT_FLT eps, FINUFFT_BIGINT nk, FINUFFT_FLT *s, FINUFFT_FLT *t,
+    FINUFFT_CPX *fk, finufft_opts *opts);
+
+FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d1)(
+    FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_FLT *yj, FINUFFT_FLT *zj, FINUFFT_CPX *cj,
+    int iflag, FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_BIGINT mu,
+    FINUFFT_CPX *fk, finufft_opts *opts);
+FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d1many)(
+    int ntransfs, FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_FLT *yj, FINUFFT_FLT *zj,
+    FINUFFT_CPX *cj, int iflag, FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_BIGINT mt,
+    FINUFFT_BIGINT mu, FINUFFT_CPX *fk, finufft_opts *opts);
+
+FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d2)(
+    FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_FLT *yj, FINUFFT_FLT *zj, FINUFFT_CPX *cj,
+    int iflag, FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_BIGINT mu,
+    FINUFFT_CPX *fk, finufft_opts *opts);
+FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d2many)(
+    int ntransf, FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_FLT *yj, FINUFFT_FLT *zj,
+    FINUFFT_CPX *cj, int iflag, FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_BIGINT mt,
+    FINUFFT_BIGINT mu, FINUFFT_CPX *fk, finufft_opts *opts);
+FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d3)(
+    FINUFFT_BIGINT nj, FINUFFT_FLT *x, FINUFFT_FLT *y, FINUFFT_FLT *z, FINUFFT_CPX *cj,
+    int iflag, FINUFFT_FLT eps, FINUFFT_BIGINT nk, FINUFFT_FLT *s, FINUFFT_FLT *t,
+    FINUFFT_FLT *u, FINUFFT_CPX *fk, finufft_opts *opts);
+FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d3many)(
+    int ntransf, FINUFFT_BIGINT nj, FINUFFT_FLT *x, FINUFFT_FLT *y, FINUFFT_FLT *z,
+    FINUFFT_CPX *cj, int iflag, FINUFFT_FLT eps, FINUFFT_BIGINT nk, FINUFFT_FLT *s,
+    FINUFFT_FLT *t, FINUFFT_FLT *u, FINUFFT_CPX *fk, finufft_opts *opts);
+
 #ifdef __cplusplus
 }
 #endif
 
-
 // clean up things that were purely local to this file
 #undef FINUFFT_COMPLEXIFY
 #undef FINUFFTIFY_UNSAFE
diff --git a/include/finufft_opts.h b/include/finufft_opts.h
index 4f6db1e02..289e779e5 100644
--- a/include/finufft_opts.h
+++ b/include/finufft_opts.h
@@ -5,19 +5,18 @@
 #ifndef FINUFFT_OPTS_H
 #define FINUFFT_OPTS_H
 
-
-typedef struct finufft_opts{  // defaults see finufft.cpp:finufft_default_opts()
+typedef struct finufft_opts { // defaults see finufft.cpp:finufft_default_opts()
   // sphinx tag (don't remove): @opts_start
   // FINUFFT options:
   // data handling opts...
-  int modeord;            // (type 1,2 only): 0 CMCL-style increasing mode order
-                          //                  1 FFT-style mode order
-  int chkbnds;            // [DEPRECATED] 0 don't check NU pts in [-3pi,3pi), 1 do (<few % slower)
-  
+  int modeord; // (type 1,2 only): 0 CMCL-style increasing mode order
+               //                  1 FFT-style mode order
+  int chkbnds; // [DEPRECATED] 0 don't check NU pts in [-3pi,3pi), 1 do (<few % slower)
+
   // diagnostic opts...
-  int debug;              // 0 silent, 1 some timing/debug, or 2 more
-  int spread_debug;       // spreader: 0 silent, 1 some timing/debug, or 2 tonnes
-  int showwarn;           // 0 don't print warnings to stderr, 1 do
+  int debug;        // 0 silent, 1 some timing/debug, or 2 more
+  int spread_debug; // spreader: 0 silent, 1 some timing/debug, or 2 tonnes
+  int showwarn;     // 0 don't print warnings to stderr, 1 do
 
   // algorithm performance opts...
   int nthreads;           // number of threads to use, or 0 uses all available
@@ -38,14 +37,20 @@ typedef struct finufft_opts{  // defaults see finufft.cpp:finufft_default_opts()
 
 // define deprecated opts macro
 #if defined(__cplusplus) && (__cplusplus >= 201402L)
-#define DEPRECATED_OPTS [[deprecated ("as of v2.1.0, nufft_opts is obsolete and renamed finufft_opts; please use this instead.")]]
+#define DEPRECATED_OPTS                                                                 \
+  [[deprecated("as of v2.1.0, nufft_opts is obsolete and renamed finufft_opts; please " \
+               "use this instead.")]]
 #elif defined(_MSC_VER)
-#define DEPRECATED_OPTS __declspec(deprecated("as of v2.1.0, nufft_opts is obsolete and renamed finufft_opts; please use this instead."))
+#define DEPRECATED_OPTS                                                     \
+  __declspec(deprecated("as of v2.1.0, nufft_opts is obsolete and renamed " \
+                        "finufft_opts; please use this instead."))
 #else
-#define DEPRECATED_OPTS __attribute__((deprecated("as of v2.1.0, nufft_opts is obsolete and renamed finufft_opts; please use this instead.")))
+#define DEPRECATED_OPTS                                                         \
+  __attribute__((deprecated("as of v2.1.0, nufft_opts is obsolete and renamed " \
+                            "finufft_opts; please use this instead.")))
 #endif
 
 // Backwards-compatibility
 DEPRECATED_OPTS typedef finufft_opts nufft_opts;
 
-#endif  // FINUFFT_OPTS_H
+#endif // FINUFFT_OPTS_H
diff --git a/include/finufft_spread_opts.h b/include/finufft_spread_opts.h
index 8549505db..2f3c9ce76 100644
--- a/include/finufft_spread_opts.h
+++ b/include/finufft_spread_opts.h
@@ -10,25 +10,25 @@
 typedef struct finufft_spread_opts {
   // See spreadinterp:setup_spreader for default values of the following fields.
   // This is the main documentation for these options...
-  int nspread;            // w, the kernel width in grid pts
-  int spread_direction;   // 1 means spread NU->U, 2 means interpolate U->NU
-  int chkbnds;            // [DEPRECATED] 0: don't check NU pts in 3-period range; 1: do
-  int sort;               // 0: don't sort NU pts, 1: do, 2: heuristic choice
-  int kerevalmeth;        // 0: direct exp(sqrt()), or 1: Horner ppval, fastest
-  int kerpad;             // 0: no pad w to mult of 4, 1: do pad
-                          // (this helps SIMD for kerevalmeth=0, eg on i7).
-  int nthreads;           // # threads for spreadinterp (0: use max avail)
-  int sort_threads;       // # threads for sort (0: auto-choice up to nthreads)
+  int nspread;             // w, the kernel width in grid pts
+  int spread_direction;    // 1 means spread NU->U, 2 means interpolate U->NU
+  int chkbnds;             // [DEPRECATED] 0: don't check NU pts in 3-period range; 1: do
+  int sort;                // 0: don't sort NU pts, 1: do, 2: heuristic choice
+  int kerevalmeth;         // 0: direct exp(sqrt()), or 1: Horner ppval, fastest
+  int kerpad;              // 0: no pad w to mult of 4, 1: do pad
+                           // (this helps SIMD for kerevalmeth=0, eg on i7).
+  int nthreads;            // # threads for spreadinterp (0: use max avail)
+  int sort_threads;        // # threads for sort (0: auto-choice up to nthreads)
   int max_subproblem_size; // # pts per t1 subprob; sets extra RAM per thread
-  int flags;              // binary flags for timing only (may give wrong ans
-                          // if changed from 0!). See spreadinterp.h
-  int debug;              // 0: silent, 1: small text output, 2: verbose
-  int atomic_threshold;   // num threads before switching spreadSorted to using atomic ops
-  double upsampfac;       // sigma, upsampling factor
+  int flags;               // binary flags for timing only (may give wrong ans
+                           // if changed from 0!). See spreadinterp.h
+  int debug;               // 0: silent, 1: small text output, 2: verbose
+  int atomic_threshold; // num threads before switching spreadSorted to using atomic ops
+  double upsampfac;     // sigma, upsampling factor
   // ES kernel specific consts for eval. No longer FLT, to avoid name clash...
   double ES_beta;
   double ES_halfwidth;
   double ES_c;
 } finufft_spread_opts;
 
-#endif   // FINUFFT_SPREAD_OPTS_H
+#endif // FINUFFT_SPREAD_OPTS_H
diff --git a/matlab/finufft.cpp b/matlab/finufft.cpp
index 9a805dade..ccbc9a59a 100644
--- a/matlab/finufft.cpp
+++ b/matlab/finufft.cpp
@@ -31,9 +31,9 @@
   THE SOFTWARE.
 */
 
+#include <stddef.h>
 #include <stdio.h>
 #include <string.h>
-#include <stddef.h>
 
 #include <mex.h>
 
@@ -41,12 +41,10 @@
 #include <matrix.h>
 #endif
 
-
 /*
  * Records for call profile.
  */
-int* mexprofrecord_= NULL;
-
+int *mexprofrecord_ = NULL;
 
 /*
  * Support routines for copying data into and out of the MEX stubs, R2018a
@@ -54,502 +52,421 @@ int* mexprofrecord_= NULL;
 
 #if MX_HAS_INTERLEAVED_COMPLEX
 
-void* mxWrapGetP(const mxArray* a, const char* fmt, const char** e)
-{
-    void* p = NULL;
+void *mxWrapGetP(const mxArray *a, const char *fmt, const char **e) {
+  void *p = NULL;
 #ifdef R2008OO
-    mxArray* ap;
+  mxArray *ap;
 #endif
-    if (mxGetClassID(a) == mxDOUBLE_CLASS && mxIsComplex(a) )
-    {
-        if( mxGetM(a)*mxGetN(a) == 1 && (*mxGetComplexDoubles(a)).real == 0 )
-        return NULL;
-    }
-    if (mxGetClassID(a) == mxDOUBLE_CLASS && !mxIsComplex(a) )
-    {
-        if( mxGetM(a)*mxGetN(a) == 1 && *mxGetDoubles(a) == 0)
-        return NULL;
-    }
-    if (mxIsChar(a)) {
-        char pbuf[128];
-        mxGetString(a, pbuf, sizeof(pbuf));
-        sscanf(pbuf, fmt, &p);
-    } 
+  if (mxGetClassID(a) == mxDOUBLE_CLASS && mxIsComplex(a)) {
+    if (mxGetM(a) * mxGetN(a) == 1 && (*mxGetComplexDoubles(a)).real == 0) return NULL;
+  }
+  if (mxGetClassID(a) == mxDOUBLE_CLASS && !mxIsComplex(a)) {
+    if (mxGetM(a) * mxGetN(a) == 1 && *mxGetDoubles(a) == 0) return NULL;
+  }
+  if (mxIsChar(a)) {
+    char pbuf[128];
+    mxGetString(a, pbuf, sizeof(pbuf));
+    sscanf(pbuf, fmt, &p);
+  }
 #ifdef R2008OO
-    else if (ap = mxGetProperty(a, 0, "mwptr")) {
-        return mxWrapGetP(ap, fmt, e);
-    }
+  else if (ap = mxGetProperty(a, 0, "mwptr")) {
+    return mxWrapGetP(ap, fmt, e);
+  }
 #endif
-    if (p == 0)
-        *e = "Invalid pointer";
-    return p;
-}
-
-mxArray* mxWrapCreateP(void* p, const char* fmt)
-{
-    if (p == 0) {
-        mxArray* z = mxCreateDoubleMatrix(1,1, mxREAL);
-        *mxGetDoubles(z) = 0;
-        return z;
-    } else {
-        char pbuf[128];
-        sprintf(pbuf, fmt, p);
-        return mxCreateString(pbuf);
-    }
-}
-
-mxArray* mxWrapStrncpy(const char* s)
-{
-    if (s) {
-        return mxCreateString(s);
-    } else {
-        mxArray* z = mxCreateDoubleMatrix(1,1, mxREAL);
-        *mxGetDoubles(z) = 0;
-        return z;
-    }
-}
-
-char* mxWrapGetString(const mxArray* a, const char** e)
-{
-    char* s;
-    mwSize slen;
-    if (!a || (!mxIsChar(a) && mxGetM(a)*mxGetN(a) > 0)) {
-        *e = "Invalid string argument";
-        return NULL;
-    }
-    slen = mxGetM(a)*mxGetN(a) + 1;
-    s = (char*) mxMalloc(slen);
-    if (mxGetM(a)*mxGetN(a) == 0)
-        *s = 0;
-    else
-        mxGetString(a, s, slen);
-    return s;
-}
-
-
-double mxWrapGetScalar(const mxArray* a, const char** e)
-{
-    if (!a || mxGetClassID(a) != mxDOUBLE_CLASS || mxGetM(a)*mxGetN(a) != 1) {
-        *e = "Invalid scalar argument";
-        return 0;
-    }
-    if( mxIsComplex(a) )
-      return (double) (*mxGetComplexDoubles(a)).real;
-    else
-      return (double) (*mxGetDoubles(a));
-}
-
-#define mxWrapGetArrayDef(func, T) \
-T* func(const mxArray* a, const char** e)     \
-{ \
-    T* array; \
-    mwSize arraylen; \
-    mwIndex i; \
-    T* p; \
-    double* q; \
-    mxComplexDouble* z; \
-    if (!a || mxGetClassID(a) != mxDOUBLE_CLASS) { \
-        *e = "Invalid array argument, mxDOUBLE_CLASS expected"; \
-        return 0; \
-    } \
-    arraylen = mxGetM(a)*mxGetN(a); \
-    array = (T*) mxMalloc(mxGetM(a)*mxGetN(a) * sizeof(T)); \
-    p = array; \
-    if( mxIsComplex(a) ) \
-      { \
-	z = mxGetComplexDoubles(a);	   \
-	for (i = 0; i < arraylen; ++i)		\
-	  *p++ = (T) (*z++).real;			\
-      } \
-    else \
-      {				   \
-	q = mxGetDoubles(a);	   \
-	for (i = 0; i < arraylen; ++i)		\
-	  *p++ = (T) (*q++);			\
-      } \
-    return array; \
-}
-
-
-#define mxWrapCopyDef(func, T) \
-void func(mxArray* a, const T* q, mwSize n) \
-{ \
-    mwIndex i; \
-    double* p;	\
-    mxComplexDouble* z; \
-    if( mxIsComplex(a) ) \
-      { \
-	z = mxGetComplexDoubles(a);	   \
-	for (i = 0; i < n; ++i)		\
-	  (*z++).real = (double) *q++;	\
-	  (*z++).imag = 0;	\
-      } \
-    else \
-      {				   \
-	p = mxGetDoubles(a);	   \
-	for (i = 0; i < n; ++i)		\
-	  *p++ = (double) *q++;		\
-      } \
-}
-
-
-#define mxWrapReturnDef(func, T) \
-mxArray* func(const T* q, mwSize m, mwSize n) \
-{ \
-    mwIndex i; \
-    double* p; \
-    if (!q) { \
-        return mxCreateDoubleMatrix(0,0, mxREAL); \
-    } else { \
-        mxArray* a = mxCreateDoubleMatrix(m,n, mxREAL); \
-        p = mxGetDoubles(a); \
-        for (i = 0; i < m*n; ++i) \
-	  *p++ = (double) *q++;	  \
-        return a; \
-    } \
-}
-
-
-#define mxWrapGetScalarZDef(func, T, ZT, setz)	\
-void func(T* z, const mxArray* a) \
-{ \
-    if( mxIsComplex(a) ) \
-      { \
-  setz(z, (ZT) (*mxGetComplexDoubles(a)).real, (ZT) (*mxGetComplexDoubles(a)).imag); \
-      } \
-    else \
-      {				   \
-  setz(z, (ZT) (*mxGetComplexDoubles(a)).real, (ZT) 0);	\
-      } \
-}
-
-
-#define mxWrapGetArrayZDef(func, T, ZT, setz)      \
-T* func(const mxArray* a, const char** e)     \
-{ \
-    T* array; \
-    mwSize arraylen; \
-    mwIndex i; \
-    T* p; \
-    double* q; \
-    mxComplexDouble* z; \
-    if (!a || mxGetClassID(a) != mxDOUBLE_CLASS) { \
-        *e = "Invalid array argument, mxDOUBLE_CLASS expected"; \
-        return 0; \
-    } \
-    arraylen = mxGetM(a)*mxGetN(a); \
-    array = (T*) mxMalloc(mxGetM(a)*mxGetN(a) * sizeof(T)); \
-    p = array; \
-    if( mxIsComplex(a) ) \
-      { \
-	z = mxGetComplexDoubles(a);	   \
-	for (i = 0; i < arraylen; ++i) {	\
-	  setz(p, (ZT) (*z).real, (ZT) (*z).imag);	\
-  	  ++p; ++z; }					\
-      } \
-    else \
-      {				   \
-	q = mxGetDoubles(a);	   \
-	for (i = 0; i < arraylen; ++i)	{	\
-	  setz(p, (ZT) (*q), (ZT) 0 );		\
-          ++p; ++q; }			\
-      }						\
-    return array; \
-}
-
-
-#define mxWrapCopyZDef(func, T, freal, fimag)	    \
-void func(mxArray* a, const T* q, mwSize n) \
-{ \
-    mwIndex i; \
-    double* p;	\
-    mxComplexDouble* z; \
-    if( mxIsComplex(a) ) \
-      { \
-	z = mxGetComplexDoubles(a);	   \
-	for (i = 0; i < n; ++i)	{		\
-          (*z).real = freal(*q);			\
-	  (*z).imag = fimag(*q);			\
-	  ++z; ++q; 	}			\
-      } \
-    else \
-      {				   \
-	p = mxGetDoubles(a);	   \
-	for (i = 0; i < n; ++i)		\
-	  *p++ = freal(*q++);		\
-      } \
-}
-
-
-#define mxWrapReturnZDef(func, T, freal, fimag)	      \
-mxArray* func(const T* q, mwSize m, mwSize n) \
-{ \
-    mwIndex i; \
-    mxComplexDouble* p; \
-    if (!q) { \
-        return mxCreateDoubleMatrix(0,0, mxCOMPLEX); \
-    } else { \
-        mxArray* a = mxCreateDoubleMatrix(m,n, mxCOMPLEX); \
-        p = mxGetComplexDoubles(a); \
-        for (i = 0; i < m*n; ++i) {	  \
-          (*p).real = freal(*q);			\
-	  (*p).imag = fimag(*q);			\
-	  ++p; ++q; 	}			\
-        return a; \
-    } \
-}
-
-
-
-
-
-
-void* mxWrapGetP_single(const mxArray* a, const char* fmt, const char** e)
-{
-    void* p = NULL;
+  if (p == 0) *e = "Invalid pointer";
+  return p;
+}
+
+mxArray *mxWrapCreateP(void *p, const char *fmt) {
+  if (p == 0) {
+    mxArray *z       = mxCreateDoubleMatrix(1, 1, mxREAL);
+    *mxGetDoubles(z) = 0;
+    return z;
+  } else {
+    char pbuf[128];
+    sprintf(pbuf, fmt, p);
+    return mxCreateString(pbuf);
+  }
+}
+
+mxArray *mxWrapStrncpy(const char *s) {
+  if (s) {
+    return mxCreateString(s);
+  } else {
+    mxArray *z       = mxCreateDoubleMatrix(1, 1, mxREAL);
+    *mxGetDoubles(z) = 0;
+    return z;
+  }
+}
+
+char *mxWrapGetString(const mxArray *a, const char **e) {
+  char *s;
+  mwSize slen;
+  if (!a || (!mxIsChar(a) && mxGetM(a) * mxGetN(a) > 0)) {
+    *e = "Invalid string argument";
+    return NULL;
+  }
+  slen = mxGetM(a) * mxGetN(a) + 1;
+  s    = (char *)mxMalloc(slen);
+  if (mxGetM(a) * mxGetN(a) == 0)
+    *s = 0;
+  else
+    mxGetString(a, s, slen);
+  return s;
+}
+
+double mxWrapGetScalar(const mxArray *a, const char **e) {
+  if (!a || mxGetClassID(a) != mxDOUBLE_CLASS || mxGetM(a) * mxGetN(a) != 1) {
+    *e = "Invalid scalar argument";
+    return 0;
+  }
+  if (mxIsComplex(a))
+    return (double)(*mxGetComplexDoubles(a)).real;
+  else
+    return (double)(*mxGetDoubles(a));
+}
+
+#define mxWrapGetArrayDef(func, T)                               \
+  T *func(const mxArray *a, const char **e) {                    \
+    T *array;                                                    \
+    mwSize arraylen;                                             \
+    mwIndex i;                                                   \
+    T *p;                                                        \
+    double *q;                                                   \
+    mxComplexDouble *z;                                          \
+    if (!a || mxGetClassID(a) != mxDOUBLE_CLASS) {               \
+      *e = "Invalid array argument, mxDOUBLE_CLASS expected";    \
+      return 0;                                                  \
+    }                                                            \
+    arraylen = mxGetM(a) * mxGetN(a);                            \
+    array    = (T *)mxMalloc(mxGetM(a) * mxGetN(a) * sizeof(T)); \
+    p        = array;                                            \
+    if (mxIsComplex(a)) {                                        \
+      z = mxGetComplexDoubles(a);                                \
+      for (i = 0; i < arraylen; ++i) *p++ = (T)(*z++).real;      \
+    } else {                                                     \
+      q = mxGetDoubles(a);                                       \
+      for (i = 0; i < arraylen; ++i) *p++ = (T)(*q++);           \
+    }                                                            \
+    return array;                                                \
+  }
+
+#define mxWrapCopyDef(func, T)                            \
+  void func(mxArray *a, const T *q, mwSize n) {           \
+    mwIndex i;                                            \
+    double *p;                                            \
+    mxComplexDouble *z;                                   \
+    if (mxIsComplex(a)) {                                 \
+      z = mxGetComplexDoubles(a);                         \
+      for (i = 0; i < n; ++i) (*z++).real = (double)*q++; \
+      (*z++).imag = 0;                                    \
+    } else {                                              \
+      p = mxGetDoubles(a);                                \
+      for (i = 0; i < n; ++i) *p++ = (double)*q++;        \
+    }                                                     \
+  }
+
+#define mxWrapReturnDef(func, T)                       \
+  mxArray *func(const T *q, mwSize m, mwSize n) {      \
+    mwIndex i;                                         \
+    double *p;                                         \
+    if (!q) {                                          \
+      return mxCreateDoubleMatrix(0, 0, mxREAL);       \
+    } else {                                           \
+      mxArray *a = mxCreateDoubleMatrix(m, n, mxREAL); \
+      p          = mxGetDoubles(a);                    \
+      for (i = 0; i < m * n; ++i) *p++ = (double)*q++; \
+      return a;                                        \
+    }                                                  \
+  }
+
+#define mxWrapGetScalarZDef(func, T, ZT, setz)                                         \
+  void func(T *z, const mxArray *a) {                                                  \
+    if (mxIsComplex(a)) {                                                              \
+      setz(z, (ZT)(*mxGetComplexDoubles(a)).real, (ZT)(*mxGetComplexDoubles(a)).imag); \
+    } else {                                                                           \
+      setz(z, (ZT)(*mxGetComplexDoubles(a)).real, (ZT)0);                              \
+    }                                                                                  \
+  }
+
+#define mxWrapGetArrayZDef(func, T, ZT, setz)                    \
+  T *func(const mxArray *a, const char **e) {                    \
+    T *array;                                                    \
+    mwSize arraylen;                                             \
+    mwIndex i;                                                   \
+    T *p;                                                        \
+    double *q;                                                   \
+    mxComplexDouble *z;                                          \
+    if (!a || mxGetClassID(a) != mxDOUBLE_CLASS) {               \
+      *e = "Invalid array argument, mxDOUBLE_CLASS expected";    \
+      return 0;                                                  \
+    }                                                            \
+    arraylen = mxGetM(a) * mxGetN(a);                            \
+    array    = (T *)mxMalloc(mxGetM(a) * mxGetN(a) * sizeof(T)); \
+    p        = array;                                            \
+    if (mxIsComplex(a)) {                                        \
+      z = mxGetComplexDoubles(a);                                \
+      for (i = 0; i < arraylen; ++i) {                           \
+        setz(p, (ZT)(*z).real, (ZT)(*z).imag);                   \
+        ++p;                                                     \
+        ++z;                                                     \
+      }                                                          \
+    } else {                                                     \
+      q = mxGetDoubles(a);                                       \
+      for (i = 0; i < arraylen; ++i) {                           \
+        setz(p, (ZT)(*q), (ZT)0);                                \
+        ++p;                                                     \
+        ++q;                                                     \
+      }                                                          \
+    }                                                            \
+    return array;                                                \
+  }
+
+#define mxWrapCopyZDef(func, T, freal, fimag)     \
+  void func(mxArray *a, const T *q, mwSize n) {   \
+    mwIndex i;                                    \
+    double *p;                                    \
+    mxComplexDouble *z;                           \
+    if (mxIsComplex(a)) {                         \
+      z = mxGetComplexDoubles(a);                 \
+      for (i = 0; i < n; ++i) {                   \
+        (*z).real = freal(*q);                    \
+        (*z).imag = fimag(*q);                    \
+        ++z;                                      \
+        ++q;                                      \
+      }                                           \
+    } else {                                      \
+      p = mxGetDoubles(a);                        \
+      for (i = 0; i < n; ++i) *p++ = freal(*q++); \
+    }                                             \
+  }
+
+#define mxWrapReturnZDef(func, T, freal, fimag)           \
+  mxArray *func(const T *q, mwSize m, mwSize n) {         \
+    mwIndex i;                                            \
+    mxComplexDouble *p;                                   \
+    if (!q) {                                             \
+      return mxCreateDoubleMatrix(0, 0, mxCOMPLEX);       \
+    } else {                                              \
+      mxArray *a = mxCreateDoubleMatrix(m, n, mxCOMPLEX); \
+      p          = mxGetComplexDoubles(a);                \
+      for (i = 0; i < m * n; ++i) {                       \
+        (*p).real = freal(*q);                            \
+        (*p).imag = fimag(*q);                            \
+        ++p;                                              \
+        ++q;                                              \
+      }                                                   \
+      return a;                                           \
+    }                                                     \
+  }
+
+void *mxWrapGetP_single(const mxArray *a, const char *fmt, const char **e) {
+  void *p = NULL;
 #ifdef R2008OO
-    mxArray* ap;
+  mxArray *ap;
 #endif
-    if (mxGetClassID(a) == mxSINGLE_CLASS && mxIsComplex(a) )
-    {
-        if( mxGetM(a)*mxGetN(a) == 1 && (*mxGetComplexSingles(a)).real == 0 )
-        return NULL;
-    }
-    if (mxGetClassID(a) == mxSINGLE_CLASS && !mxIsComplex(a) )
-    {
-        if( mxGetM(a)*mxGetN(a) == 1 && *mxGetSingles(a) == 0)
-        return NULL;
-    }
-    if (mxIsChar(a)) {
-        char pbuf[128];
-        mxGetString(a, pbuf, sizeof(pbuf));
-        sscanf(pbuf, fmt, &p);
-    } 
+  if (mxGetClassID(a) == mxSINGLE_CLASS && mxIsComplex(a)) {
+    if (mxGetM(a) * mxGetN(a) == 1 && (*mxGetComplexSingles(a)).real == 0) return NULL;
+  }
+  if (mxGetClassID(a) == mxSINGLE_CLASS && !mxIsComplex(a)) {
+    if (mxGetM(a) * mxGetN(a) == 1 && *mxGetSingles(a) == 0) return NULL;
+  }
+  if (mxIsChar(a)) {
+    char pbuf[128];
+    mxGetString(a, pbuf, sizeof(pbuf));
+    sscanf(pbuf, fmt, &p);
+  }
 #ifdef R2008OO
-    else if (ap = mxGetProperty(a, 0, "mwptr")) {
-        return mxWrapGetP(ap, fmt, e);
-    }
+  else if (ap = mxGetProperty(a, 0, "mwptr")) {
+    return mxWrapGetP(ap, fmt, e);
+  }
 #endif
-    if (p == 0)
-        *e = "Invalid pointer";
-    return p;
-}
-
-mxArray* mxWrapCreateP_single(void* p, const char* fmt)
-{
-    if (p == 0) {
-        mxArray* z = mxCreateNumericMatrix(1,1, mxSINGLE_CLASS, mxREAL);
-        *mxGetSingles(z) = 0;
-        return z;
-    } else {
-        char pbuf[128];
-        sprintf(pbuf, fmt, p);
-        return mxCreateString(pbuf);
-    }
-}
-
-mxArray* mxWrapStrncpy_single(const char* s)
-{
-    if (s) {
-        return mxCreateString(s);
-    } else {
-        mxArray* z = mxCreateNumericMatrix(1,1, mxSINGLE_CLASS, mxREAL);
-        *mxGetSingles(z) = 0;
-        return z;
-    }
-}
-
-char* mxWrapGetString_single(const mxArray* a, const char** e)
-{
-    char* s;
-    mwSize slen;
-    if (!a || (!mxIsChar(a) && mxGetM(a)*mxGetN(a) > 0)) {
-        *e = "Invalid string argument";
-        return NULL;
-    }
-    slen = mxGetM(a)*mxGetN(a) + 1;
-    s = (char*) mxMalloc(slen);
-    if (mxGetM(a)*mxGetN(a) == 0)
-        *s = 0;
-    else
-        mxGetString(a, s, slen);
-    return s;
-}
-
-
-float mxWrapGetScalar_single(const mxArray* a, const char** e)
-{
-    if (!a || mxGetClassID(a) != mxSINGLE_CLASS || mxGetM(a)*mxGetN(a) != 1) {
-        *e = "Invalid scalar argument";
-        return 0;
-    }
-    if( mxIsComplex(a) )
-      return (float) (*mxGetComplexSingles(a)).real;
-    else
-      return (float) (*mxGetSingles(a));
-}
-
-#define mxWrapGetArrayDef_single(func, T) \
-T* func(const mxArray* a, const char** e)     \
-{ \
-    T* array; \
-    mwSize arraylen; \
-    mwIndex i; \
-    T* p; \
-    float* q; \
-    mxComplexSingle* z; \
-    if (!a || mxGetClassID(a) != mxSINGLE_CLASS) { \
-        *e = "Invalid array argument, mxSINGLE_CLASS expected"; \
-        return 0; \
-    } \
-    arraylen = mxGetM(a)*mxGetN(a); \
-    array = (T*) mxMalloc(mxGetM(a)*mxGetN(a) * sizeof(T)); \
-    p = array; \
-    if( mxIsComplex(a) ) \
-      { \
-	z = mxGetComplexSingles(a);	   \
-	for (i = 0; i < arraylen; ++i)		\
-	  *p++ = (T) (*z++).real;			\
-      } \
-    else \
-      {				   \
-	q = mxGetSingles(a);	   \
-	for (i = 0; i < arraylen; ++i)		\
-	  *p++ = (T) (*q++);			\
-      } \
-    return array; \
-}
-
-
-#define mxWrapCopyDef_single(func, T) \
-void func(mxArray* a, const T* q, mwSize n) \
-{ \
-    mwIndex i; \
-    float* p;	\
-    mxComplexSingle* z; \
-    if( mxIsComplex(a) ) \
-      { \
-	z = mxGetComplexSingles(a);	   \
-	for (i = 0; i < n; ++i)		\
-	  (*z++).real = (float) *q++;	\
-	  (*z++).imag = 0;	\
-      } \
-    else \
-      {				   \
-	p = mxGetSingles(a);	   \
-	for (i = 0; i < n; ++i)		\
-	  *p++ = (float) *q++;		\
-      } \
-}
-
-
-#define mxWrapReturnDef_single(func, T) \
-mxArray* func(const T* q, mwSize m, mwSize n) \
-{ \
-    mwIndex i; \
-    float* p; \
-    if (!q) { \
-        return mxCreateNumericMatrix(0,0, mxSINGLE_CLASS, mxREAL); \
-    } else { \
-        mxArray* a = mxCreateNumericMatrix(m,n, mxSINGLE_CLASS, mxREAL); \
-        p = mxGetSingles(a); \
-        for (i = 0; i < m*n; ++i) \
-	  *p++ = (float) *q++;	  \
-        return a; \
-    } \
-}
-
-
-#define mxWrapGetScalarZDef_single(func, T, ZT, setz)	\
-void func(T* z, const mxArray* a) \
-{ \
-    if( mxIsComplex(a) ) \
-      { \
-  setz(z, (ZT) (*mxGetComplexSingles(a)).real, (ZT) (*mxGetComplexSingles(a)).imag); \
-      } \
-    else \
-      {				   \
-  setz(z, (ZT) (*mxGetComplexSingles(a)).real, (ZT) 0);	\
-      } \
-}
-
-
-#define mxWrapGetArrayZDef_single(func, T, ZT, setz)      \
-T* func(const mxArray* a, const char** e)     \
-{ \
-    T* array; \
-    mwSize arraylen; \
-    mwIndex i; \
-    T* p; \
-    float* q; \
-    mxComplexSingle* z; \
-    if (!a || mxGetClassID(a) != mxSINGLE_CLASS) { \
-        *e = "Invalid array argument, mxSINGLE_CLASS expected"; \
-        return 0; \
-    } \
-    arraylen = mxGetM(a)*mxGetN(a); \
-    array = (T*) mxMalloc(mxGetM(a)*mxGetN(a) * sizeof(T)); \
-    p = array; \
-    if( mxIsComplex(a) ) \
-      { \
-	z = mxGetComplexSingles(a);	   \
-	for (i = 0; i < arraylen; ++i) {	\
-	  setz(p, (ZT) (*z).real, (ZT) (*z).imag);	\
-  	  ++p; ++z; }					\
-      } \
-    else \
-      {				   \
-	q = mxGetSingles(a);	   \
-	for (i = 0; i < arraylen; ++i)	{	\
-	  setz(p, (ZT) (*q), (ZT) 0 );		\
-          ++p; ++q; }			\
-      }						\
-    return array; \
-}
-
-
-#define mxWrapCopyZDef_single(func, T, freal, fimag)	    \
-void func(mxArray* a, const T* q, mwSize n) \
-{ \
-    mwIndex i; \
-    float* p;	\
-    mxComplexSingle* z; \
-    if( mxIsComplex(a) ) \
-      { \
-	z = mxGetComplexSingles(a);	   \
-	for (i = 0; i < n; ++i)	{		\
-          (*z).real = freal(*q);			\
-	  (*z).imag = fimag(*q);			\
-	  ++z; ++q; 	}			\
-      } \
-    else \
-      {				   \
-	p = mxGetSingles(a);	   \
-	for (i = 0; i < n; ++i)		\
-	  *p++ = freal(*q++);		\
-      } \
-}
-
-
-#define mxWrapReturnZDef_single(func, T, freal, fimag)	      \
-mxArray* func(const T* q, mwSize m, mwSize n) \
-{ \
-    mwIndex i; \
-    mxComplexSingle* p; \
-    if (!q) { \
-        return mxCreateNumericMatrix(0,0, mxSINGLE_CLASS, mxCOMPLEX); \
-    } else { \
-        mxArray* a = mxCreateNumericMatrix(m,n, mxSINGLE_CLASS, mxCOMPLEX); \
-        p = mxGetComplexSingles(a); \
-        for (i = 0; i < m*n; ++i) {	  \
-          (*p).real = freal(*q);			\
-	  (*p).imag = fimag(*q);			\
-	  ++p; ++q; 	}			\
-        return a; \
-    } \
-}
-
-
+  if (p == 0) *e = "Invalid pointer";
+  return p;
+}
+
+mxArray *mxWrapCreateP_single(void *p, const char *fmt) {
+  if (p == 0) {
+    mxArray *z       = mxCreateNumericMatrix(1, 1, mxSINGLE_CLASS, mxREAL);
+    *mxGetSingles(z) = 0;
+    return z;
+  } else {
+    char pbuf[128];
+    sprintf(pbuf, fmt, p);
+    return mxCreateString(pbuf);
+  }
+}
+
+mxArray *mxWrapStrncpy_single(const char *s) {
+  if (s) {
+    return mxCreateString(s);
+  } else {
+    mxArray *z       = mxCreateNumericMatrix(1, 1, mxSINGLE_CLASS, mxREAL);
+    *mxGetSingles(z) = 0;
+    return z;
+  }
+}
+
+char *mxWrapGetString_single(const mxArray *a, const char **e) {
+  char *s;
+  mwSize slen;
+  if (!a || (!mxIsChar(a) && mxGetM(a) * mxGetN(a) > 0)) {
+    *e = "Invalid string argument";
+    return NULL;
+  }
+  slen = mxGetM(a) * mxGetN(a) + 1;
+  s    = (char *)mxMalloc(slen);
+  if (mxGetM(a) * mxGetN(a) == 0)
+    *s = 0;
+  else
+    mxGetString(a, s, slen);
+  return s;
+}
+
+float mxWrapGetScalar_single(const mxArray *a, const char **e) {
+  if (!a || mxGetClassID(a) != mxSINGLE_CLASS || mxGetM(a) * mxGetN(a) != 1) {
+    *e = "Invalid scalar argument";
+    return 0;
+  }
+  if (mxIsComplex(a))
+    return (float)(*mxGetComplexSingles(a)).real;
+  else
+    return (float)(*mxGetSingles(a));
+}
+
+#define mxWrapGetArrayDef_single(func, T)                        \
+  T *func(const mxArray *a, const char **e) {                    \
+    T *array;                                                    \
+    mwSize arraylen;                                             \
+    mwIndex i;                                                   \
+    T *p;                                                        \
+    float *q;                                                    \
+    mxComplexSingle *z;                                          \
+    if (!a || mxGetClassID(a) != mxSINGLE_CLASS) {               \
+      *e = "Invalid array argument, mxSINGLE_CLASS expected";    \
+      return 0;                                                  \
+    }                                                            \
+    arraylen = mxGetM(a) * mxGetN(a);                            \
+    array    = (T *)mxMalloc(mxGetM(a) * mxGetN(a) * sizeof(T)); \
+    p        = array;                                            \
+    if (mxIsComplex(a)) {                                        \
+      z = mxGetComplexSingles(a);                                \
+      for (i = 0; i < arraylen; ++i) *p++ = (T)(*z++).real;      \
+    } else {                                                     \
+      q = mxGetSingles(a);                                       \
+      for (i = 0; i < arraylen; ++i) *p++ = (T)(*q++);           \
+    }                                                            \
+    return array;                                                \
+  }
+
+#define mxWrapCopyDef_single(func, T)                    \
+  void func(mxArray *a, const T *q, mwSize n) {          \
+    mwIndex i;                                           \
+    float *p;                                            \
+    mxComplexSingle *z;                                  \
+    if (mxIsComplex(a)) {                                \
+      z = mxGetComplexSingles(a);                        \
+      for (i = 0; i < n; ++i) (*z++).real = (float)*q++; \
+      (*z++).imag = 0;                                   \
+    } else {                                             \
+      p = mxGetSingles(a);                               \
+      for (i = 0; i < n; ++i) *p++ = (float)*q++;        \
+    }                                                    \
+  }
+
+#define mxWrapReturnDef_single(func, T)                                 \
+  mxArray *func(const T *q, mwSize m, mwSize n) {                       \
+    mwIndex i;                                                          \
+    float *p;                                                           \
+    if (!q) {                                                           \
+      return mxCreateNumericMatrix(0, 0, mxSINGLE_CLASS, mxREAL);       \
+    } else {                                                            \
+      mxArray *a = mxCreateNumericMatrix(m, n, mxSINGLE_CLASS, mxREAL); \
+      p          = mxGetSingles(a);                                     \
+      for (i = 0; i < m * n; ++i) *p++ = (float)*q++;                   \
+      return a;                                                         \
+    }                                                                   \
+  }
+
+#define mxWrapGetScalarZDef_single(func, T, ZT, setz)                                  \
+  void func(T *z, const mxArray *a) {                                                  \
+    if (mxIsComplex(a)) {                                                              \
+      setz(z, (ZT)(*mxGetComplexSingles(a)).real, (ZT)(*mxGetComplexSingles(a)).imag); \
+    } else {                                                                           \
+      setz(z, (ZT)(*mxGetComplexSingles(a)).real, (ZT)0);                              \
+    }                                                                                  \
+  }
+
+#define mxWrapGetArrayZDef_single(func, T, ZT, setz)             \
+  T *func(const mxArray *a, const char **e) {                    \
+    T *array;                                                    \
+    mwSize arraylen;                                             \
+    mwIndex i;                                                   \
+    T *p;                                                        \
+    float *q;                                                    \
+    mxComplexSingle *z;                                          \
+    if (!a || mxGetClassID(a) != mxSINGLE_CLASS) {               \
+      *e = "Invalid array argument, mxSINGLE_CLASS expected";    \
+      return 0;                                                  \
+    }                                                            \
+    arraylen = mxGetM(a) * mxGetN(a);                            \
+    array    = (T *)mxMalloc(mxGetM(a) * mxGetN(a) * sizeof(T)); \
+    p        = array;                                            \
+    if (mxIsComplex(a)) {                                        \
+      z = mxGetComplexSingles(a);                                \
+      for (i = 0; i < arraylen; ++i) {                           \
+        setz(p, (ZT)(*z).real, (ZT)(*z).imag);                   \
+        ++p;                                                     \
+        ++z;                                                     \
+      }                                                          \
+    } else {                                                     \
+      q = mxGetSingles(a);                                       \
+      for (i = 0; i < arraylen; ++i) {                           \
+        setz(p, (ZT)(*q), (ZT)0);                                \
+        ++p;                                                     \
+        ++q;                                                     \
+      }                                                          \
+    }                                                            \
+    return array;                                                \
+  }
+
+#define mxWrapCopyZDef_single(func, T, freal, fimag) \
+  void func(mxArray *a, const T *q, mwSize n) {      \
+    mwIndex i;                                       \
+    float *p;                                        \
+    mxComplexSingle *z;                              \
+    if (mxIsComplex(a)) {                            \
+      z = mxGetComplexSingles(a);                    \
+      for (i = 0; i < n; ++i) {                      \
+        (*z).real = freal(*q);                       \
+        (*z).imag = fimag(*q);                       \
+        ++z;                                         \
+        ++q;                                         \
+      }                                              \
+    } else {                                         \
+      p = mxGetSingles(a);                           \
+      for (i = 0; i < n; ++i) *p++ = freal(*q++);    \
+    }                                                \
+  }
+
+#define mxWrapReturnZDef_single(func, T, freal, fimag)                     \
+  mxArray *func(const T *q, mwSize m, mwSize n) {                          \
+    mwIndex i;                                                             \
+    mxComplexSingle *p;                                                    \
+    if (!q) {                                                              \
+      return mxCreateNumericMatrix(0, 0, mxSINGLE_CLASS, mxCOMPLEX);       \
+    } else {                                                               \
+      mxArray *a = mxCreateNumericMatrix(m, n, mxSINGLE_CLASS, mxCOMPLEX); \
+      p          = mxGetComplexSingles(a);                                 \
+      for (i = 0; i < m * n; ++i) {                                        \
+        (*p).real = freal(*q);                                             \
+        (*p).imag = fimag(*q);                                             \
+        ++p;                                                               \
+        ++q;                                                               \
+      }                                                                    \
+      return a;                                                            \
+    }                                                                      \
+  }
 
 #else
 
@@ -557,1672 +474,1533 @@ mxArray* func(const T* q, mwSize m, mwSize n) \
  * Support routines for copying data into and out of the MEX stubs, -R2017b
  */
 
-void* mxWrapGetP(const mxArray* a, const char* fmt, const char** e)
-{
-    void* p = 0;
-#ifdef R2008OO
-    mxArray* ap;
-#endif
-    if (mxGetClassID(a) == mxDOUBLE_CLASS && 
-        mxGetM(a)*mxGetN(a) == 1 && *mxGetPr(a) == 0)
-        return p;
-    if (mxIsChar(a)) {
-        char pbuf[128];
-        mxGetString(a, pbuf, sizeof(pbuf));
-        sscanf(pbuf, fmt, &p);
-    } 
+void *mxWrapGetP(const mxArray *a, const char *fmt, const char **e) {
+  void *p = 0;
 #ifdef R2008OO
-    else if (ap = mxGetProperty(a, 0, "mwptr")) {
-        return mxWrapGetP(ap, fmt, e);
-    }
+  mxArray *ap;
 #endif
-    if (p == 0)
-        *e = "Invalid pointer";
+  if (mxGetClassID(a) == mxDOUBLE_CLASS && mxGetM(a) * mxGetN(a) == 1 && *mxGetPr(a) == 0)
     return p;
-}
-
-mxArray* mxWrapCreateP(void* p, const char* fmt)
-{
-    if (p == 0) {
-        mxArray* z = mxCreateDoubleMatrix(1,1, mxREAL);
-        *mxGetPr(z) = 0;
-        return z;
-    } else {
-        char pbuf[128];
-        sprintf(pbuf, fmt, p);
-        return mxCreateString(pbuf);
-    }
-}
-
-mxArray* mxWrapStrncpy(const char* s)
-{
-    if (s) {
-        return mxCreateString(s);
-    } else {
-        mxArray* z = mxCreateDoubleMatrix(1,1, mxREAL);
-        *mxGetPr(z) = 0;
-        return z;
-    }
-}
-
-double mxWrapGetScalar(const mxArray* a, const char** e)
-{
-    if (!a || mxGetClassID(a) != mxDOUBLE_CLASS || mxGetM(a)*mxGetN(a) != 1) {
-        *e = "Invalid scalar argument";
-        return 0;
-    }
-    return *mxGetPr(a);
-}
-
-char* mxWrapGetString(const mxArray* a, const char** e)
-{
-    char* s;
-    mwSize slen;
-    if (!a || (!mxIsChar(a) && mxGetM(a)*mxGetN(a) > 0)) {
-        *e = "Invalid string argument";
-        return NULL;
-    }
-    slen = mxGetM(a)*mxGetN(a) + 1;
-    s = (char*) mxMalloc(slen);
-    if (mxGetM(a)*mxGetN(a) == 0)
-        *s = 0;
-    else
-        mxGetString(a, s, slen);
-    return s;
-}
-
-
-#define mxWrapGetArrayDef(func, T) \
-T* func(const mxArray* a, const char** e)     \
-{ \
-    T* array; \
-    mwSize arraylen; \
-    mwIndex i; \
-    T* p; \
-    double* q; \
-    if (!a || mxGetClassID(a) != mxDOUBLE_CLASS) { \
-        *e = "Invalid array argument, mxDOUBLE_CLASS expected"; \
-        return 0; \
-    } \
-    arraylen = mxGetM(a)*mxGetN(a); \
-    array = (T*) mxMalloc(mxGetM(a)*mxGetN(a) * sizeof(T)); \
-    p = array; \
-    q = mxGetPr(a); \
-    for (i = 0; i < arraylen; ++i) \
-        *p++ = (T) (*q++); \
-    return array; \
-}
-
-
-#define mxWrapCopyDef(func, T) \
-void func(mxArray* a, const T* q, mwSize n) \
-{ \
-    mwIndex i; \
-    double* p = mxGetPr(a); \
-    for (i = 0; i < n; ++i) \
-        *p++ = *q++; \
-}
-
-
-#define mxWrapReturnDef(func, T) \
-mxArray* func(const T* q, mwSize m, mwSize n) \
-{ \
-    mwIndex i; \
-    double* p; \
-    if (!q) { \
-        return mxCreateDoubleMatrix(0,0, mxREAL); \
-    } else { \
-        mxArray* a = mxCreateDoubleMatrix(m,n, mxREAL); \
-        p = mxGetPr(a); \
-        for (i = 0; i < m*n; ++i) \
-            *p++ = *q++; \
-        return a; \
-    } \
-}
-
-
-#define mxWrapGetScalarZDef(func, T, ZT, setz) \
-void func(T* z, const mxArray* a) \
-{ \
-    double* pr = mxGetPr(a); \
-    double* pi = mxGetPi(a); \
-    setz(z, (ZT) *pr, (pi ? (ZT) *pi : (ZT) 0)); \
-}
-
-
-#define mxWrapGetArrayZDef(func, T, ZT, setz) \
-T* func(const mxArray* a, const char** e) \
-{ \
-    T* array; \
-    mwSize arraylen; \
-    mwIndex i; \
-    T* p; \
-    double* qr; \
-    double* qi; \
-    if (!a || mxGetClassID(a) != mxDOUBLE_CLASS) { \
-        *e = "Invalid array argument, mxDOUBLE_CLASS expected"; \
-        return 0; \
-    } \
-    arraylen = mxGetM(a)*mxGetN(a); \
-    array = (T*) mxMalloc(mxGetM(a)*mxGetN(a) * sizeof(T)); \
-    p = array; \
-    qr = mxGetPr(a); \
-    qi = mxGetPi(a); \
-    for (i = 0; i < arraylen; ++i) { \
-        ZT val_qr = *qr++; \
-        ZT val_qi = (qi ? (ZT) *qi++ : (ZT) 0); \
-        setz(p, val_qr, val_qi); \
-        ++p; \
-    } \
-    return array; \
-}
-
-
-#define mxWrapCopyZDef(func, T, real, imag) \
-void func(mxArray* a, const T* q, mwSize n) \
-{ \
-    mwIndex i; \
-    double* pr = mxGetPr(a); \
-    double* pi = mxGetPi(a); \
-    for (i = 0; i < n; ++i) { \
-        *pr++ = real(*q); \
-        *pi++ = imag(*q); \
-        ++q; \
-    } \
-}
-
-
-#define mxWrapReturnZDef(func, T, real, imag) \
-mxArray* func(const T* q, mwSize m, mwSize n) \
-{ \
-    mwIndex i; \
-    double* pr; \
-    double* pi; \
-    if (!q) { \
-        return mxCreateDoubleMatrix(0,0, mxCOMPLEX); \
-    } else { \
-        mxArray* a = mxCreateDoubleMatrix(m,n, mxCOMPLEX); \
-        pr = mxGetPr(a); \
-        pi = mxGetPi(a); \
-        for (i = 0; i < m*n; ++i) { \
-            *pr++ = real(*q); \
-            *pi++ = imag(*q); \
-            ++q; \
-        } \
-        return a; \
-    } \
-}
-
-
-
-
-
-
-void* mxWrapGetP_single(const mxArray* a, const char* fmt, const char** e)
-{
-    void* p = 0;
+  if (mxIsChar(a)) {
+    char pbuf[128];
+    mxGetString(a, pbuf, sizeof(pbuf));
+    sscanf(pbuf, fmt, &p);
+  }
 #ifdef R2008OO
-    mxArray* ap;
+  else if (ap = mxGetProperty(a, 0, "mwptr")) {
+    return mxWrapGetP(ap, fmt, e);
+  }
 #endif
-    if (mxGetClassID(a) == mxSINGLE_CLASS && 
-        mxGetM(a)*mxGetN(a) == 1 && *((float*)mxGetData(a)) == 0)
-        return p;
-    if (mxIsChar(a)) {
-        char pbuf[128];
-        mxGetString(a, pbuf, sizeof(pbuf));
-        sscanf(pbuf, fmt, &p);
-    } 
+  if (p == 0) *e = "Invalid pointer";
+  return p;
+}
+
+/* Encode p for MATLAB: 1x1 double 0 if p is null, else p printed with fmt. */
+mxArray *mxWrapCreateP(void *p, const char *fmt) {
+  char pbuf[128];
+  if (p == 0) {
+    mxArray *z  = mxCreateDoubleMatrix(1, 1, mxREAL);
+    *mxGetPr(z) = 0;
+    return z;
+  }
+  snprintf(pbuf, sizeof(pbuf), fmt, p); /* truncates rather than overrunning pbuf */
+  return mxCreateString(pbuf);
+}
+
+mxArray *mxWrapStrncpy(const char *s) { /* wrap the C string s as a MATLAB value */
+  if (s) {
+    return mxCreateString(s);
+  } else { /* null s is returned as a 1-by-1 double holding 0 */
+    mxArray *z  = mxCreateDoubleMatrix(1, 1, mxREAL);
+    *mxGetPr(z) = 0;
+    return z;
+  }
+}
+
+double mxWrapGetScalar(const mxArray *a, const char **e) { /* read a 1-element double */
+  if (!a || mxGetClassID(a) != mxDOUBLE_CLASS || mxGetM(a) * mxGetN(a) != 1) {
+    *e = "Invalid scalar argument"; /* failure is reported through *e; 0 is returned */
+    return 0;
+  }
+  return *mxGetPr(a);
+}
+
+char *mxWrapGetString(const mxArray *a, const char **e) { /* copy a into a new C string */
+  char *s;
+  mwSize slen;
+  if (!a || (!mxIsChar(a) && mxGetM(a) * mxGetN(a) > 0)) { /* non-char ok only if empty */
+    *e = "Invalid string argument"; /* failure: message in *e, NULL returned */
+    return NULL;
+  }
+  slen = mxGetM(a) * mxGetN(a) + 1; /* element count plus one for the terminator */
+  s    = (char *)mxMalloc(slen);    /* mxMalloc'd buffer is returned to the caller */
+  if (mxGetM(a) * mxGetN(a) == 0)
+    *s = 0; /* empty input yields the empty string */
+  else
+    mxGetString(a, s, slen);
+  return s;
+}
+
+#define mxWrapGetArrayDef(func, T) /* defines func: double mxArray -> new T[] */ \
+  T *func(const mxArray *a, const char **e) {                    \
+    T *array;                                                    \
+    mwSize arraylen;                                             \
+    mwIndex i;                                                   \
+    T *p;                                                        \
+    double *q;                                                   \
+    if (!a || mxGetClassID(a) != mxDOUBLE_CLASS) {               \
+      *e = "Invalid array argument, mxDOUBLE_CLASS expected";    \
+      return 0; /* failure: message left in *e */                \
+    }                                                            \
+    arraylen = mxGetM(a) * mxGetN(a);                            \
+    array    = (T *)mxMalloc(mxGetM(a) * mxGetN(a) * sizeof(T)); \
+    p        = array;                                            \
+    q        = mxGetPr(a);                                       \
+    for (i = 0; i < arraylen; ++i) *p++ = (T)(*q++); /* cast each element to T */ \
+    return array; /* mxMalloc'd copy, returned to the caller */  \
+  }
+
+#define mxWrapCopyDef(func, T) /* defines func: copy n T values from q into a */ \
+  void func(mxArray *a, const T *q, mwSize n) { \
+    mwIndex i;                                  \
+    double *p = mxGetPr(a); /* destination is accessed as double data */ \
+    for (i = 0; i < n; ++i) *p++ = *q++;        \
+  }
+
+#define mxWrapReturnDef(func, T) /* defines func: T[] -> new m-by-n real double mxArray */ \
+  mxArray *func(const T *q, mwSize m, mwSize n) {      \
+    mwIndex i;                                         \
+    double *p;                                         \
+    if (!q) { /* null q: return an empty 0-by-0 matrix */ \
+      return mxCreateDoubleMatrix(0, 0, mxREAL);       \
+    } else {                                           \
+      mxArray *a = mxCreateDoubleMatrix(m, n, mxREAL); \
+      p          = mxGetPr(a);                         \
+      for (i = 0; i < m * n; ++i) *p++ = *q++; /* copy all m*n elements in order */ \
+      return a;                                        \
+    }                                                  \
+  }
+
+#define mxWrapGetScalarZDef(func, T, ZT, setz) /* defines func: first element of a -> *z */ \
+  void func(T *z, const mxArray *a) {             \
+    double *pr = mxGetPr(a);                      \
+    double *pi = mxGetPi(a); /* tested for null below */ \
+    setz(z, (ZT) * pr, (pi ? (ZT) * pi : (ZT)0)); /* null pi: imaginary part is 0 */ \
+  }
+
+#define mxWrapGetArrayZDef(func, T, ZT, setz) /* defines func: double mxArray -> new complex T[] */ \
+  T *func(const mxArray *a, const char **e) {                    \
+    T *array;                                                    \
+    mwSize arraylen;                                             \
+    mwIndex i;                                                   \
+    T *p;                                                        \
+    double *qr;                                                  \
+    double *qi;                                                  \
+    if (!a || mxGetClassID(a) != mxDOUBLE_CLASS) {               \
+      *e = "Invalid array argument, mxDOUBLE_CLASS expected";    \
+      return 0; /* failure: message left in *e */                \
+    }                                                            \
+    arraylen = mxGetM(a) * mxGetN(a);                            \
+    array    = (T *)mxMalloc(mxGetM(a) * mxGetN(a) * sizeof(T)); \
+    p        = array;                                            \
+    qr       = mxGetPr(a);                                       \
+    qi       = mxGetPi(a); /* tested for null inside the loop */ \
+    for (i = 0; i < arraylen; ++i) {                             \
+      ZT val_qr = *qr++;                                         \
+      ZT val_qi = (qi ? (ZT) * qi++ : (ZT)0); /* null qi: imaginary part is 0 */ \
+      setz(p, val_qr, val_qi); /* setz(dst, re, im) stores one complex value */ \
+      ++p;                                                       \
+    }                                                            \
+    return array; /* mxMalloc'd copy, returned to the caller */  \
+  }
+
+#define mxWrapCopyZDef(func, T, real, imag) /* defines func: copy n complex T values into a */ \
+  void func(mxArray *a, const T *q, mwSize n) { \
+    mwIndex i;                                  \
+    double *pr = mxGetPr(a);                    \
+    double *pi = mxGetPi(a); /* written without a null check */ \
+    for (i = 0; i < n; ++i) {                   \
+      *pr++ = real(*q); /* real(x)/imag(x) supply the two parts of a T */ \
+      *pi++ = imag(*q);                         \
+      ++q;                                      \
+    }                                           \
+  }
+
+#define mxWrapReturnZDef(func, T, real, imag) /* defines func: T[] -> new m-by-n complex double mxArray */ \
+  mxArray *func(const T *q, mwSize m, mwSize n) {         \
+    mwIndex i;                                            \
+    double *pr;                                           \
+    double *pi;                                           \
+    if (!q) { /* null q: return an empty 0-by-0 complex matrix */ \
+      return mxCreateDoubleMatrix(0, 0, mxCOMPLEX);       \
+    } else {                                              \
+      mxArray *a = mxCreateDoubleMatrix(m, n, mxCOMPLEX); \
+      pr         = mxGetPr(a);                            \
+      pi         = mxGetPi(a);                            \
+      for (i = 0; i < m * n; ++i) { /* copy all m*n elements in order */ \
+        *pr++ = real(*q); /* real and imaginary parts go to separate arrays */ \
+        *pi++ = imag(*q);                                 \
+        ++q;                                              \
+      }                                                   \
+      return a;                                           \
+    }                                                     \
+  }
+
+void *mxWrapGetP_single(const mxArray *a, const char *fmt, const char **e) {
+  void *p = 0;
 #ifdef R2008OO
-    else if (ap = mxGetProperty(a, 0, "mwptr")) {
-        return mxWrapGetP(ap, fmt, e);
-    }
+  mxArray *ap;
 #endif
-    if (p == 0)
-        *e = "Invalid pointer";
+  if (mxGetClassID(a) == mxSINGLE_CLASS && mxGetM(a) * mxGetN(a) == 1 &&
+      *((float *)mxGetData(a)) == 0)
     return p;
-}
-
-mxArray* mxWrapCreateP_single(void* p, const char* fmt)
-{
-    if (p == 0) {
-        mxArray* z = mxCreateNumericMatrix(1,1, mxSINGLE_CLASS, mxREAL);
-        *((float*)mxGetData(z)) = 0;
-        return z;
-    } else {
-        char pbuf[128];
-        sprintf(pbuf, fmt, p);
-        return mxCreateString(pbuf);
-    }
-}
-mxArray* mxWrapStrncpy_single(const char* s)
-{
-    if (s) {
-        return mxCreateString(s);
-    } else {
-        mxArray* z = mxCreateNumericMatrix(1,1, mxSINGLE_CLASS, mxREAL);
-        *((float*)mxGetData(z)) = 0;
-        return z;
-    }
-}
-
-float mxWrapGetScalar_single(const mxArray* a, const char** e)
-{
-    if (!a || mxGetClassID(a) != mxSINGLE_CLASS || mxGetM(a)*mxGetN(a) != 1) {
-        *e = "Invalid scalar argument";
-        return 0;
-    }
-    return *((float*)mxGetData(a));
-}
-
-char* mxWrapGetString_single(const mxArray* a, const char** e)
-{
-    char* s;
-    mwSize slen;
-    if (!a || (!mxIsChar(a) && mxGetM(a)*mxGetN(a) > 0)) {
-        *e = "Invalid string argument, mxSINGLE_CLASS expected";
-        return NULL;
-    }
-    slen = mxGetM(a)*mxGetN(a) + 1;
-    s = (char*) mxMalloc(slen);
-    if (mxGetM(a)*mxGetN(a) == 0)
-        *s = 0;
-    else
-        mxGetString(a, s, slen);
-    return s;
-}
-
-
-#define mxWrapGetArrayDef_single(func, T) \
-T* func(const mxArray* a, const char** e)     \
-{ \
-    T* array; \
-    mwSize arraylen; \
-    mwIndex i; \
-    T* p; \
-    float* q; \
-    if (!a || mxGetClassID(a) != mxSINGLE_CLASS) { \
-        *e = "Invalid array argument, mxSINGLE_CLASS expected"; \
-        return 0; \
-    } \
-    arraylen = mxGetM(a)*mxGetN(a); \
-    array = (T*) mxMalloc(mxGetM(a)*mxGetN(a) * sizeof(T)); \
-    p = array; \
-    q = (float*) mxGetData(a);	   \
-    for (i = 0; i < arraylen; ++i) \
-        *p++ = (T) (*q++); \
-    return array; \
-}
-
-
-#define mxWrapCopyDef_single(func, T) \
-void func(mxArray* a, const T* q, mwSize n) \
-{ \
-    mwIndex i; \
-    float* p = (float*) mxGetData(a);		\
-    for (i = 0; i < n; ++i) \
-        *p++ = *q++; \
-}
-
-
-#define mxWrapReturnDef_single(func, T) \
-mxArray* func(const T* q, mwSize m, mwSize n) \
-{ \
-    mwIndex i; \
-    float* p; \
-    if (!q) { \
-      return mxCreateNumericMatrix(0,0, mxSINGLE_CLASS, mxREAL); \
-    } else { \
-        mxArray* a = mxCreateNumericMatrix(m,n, mxSINGLE_CLASS, mxREAL);\
-        p = (float*) mxGetData(a);				\
-        for (i = 0; i < m*n; ++i) \
-            *p++ = *q++; \
-        return a; \
-    } \
-}
-
+  if (mxIsChar(a)) {
+    char pbuf[128];
+    mxGetString(a, pbuf, sizeof(pbuf));
+    sscanf(pbuf, fmt, &p);
+  }
+#ifdef R2008OO
+  else if (ap = mxGetProperty(a, 0, "mwptr")) {
+    return mxWrapGetP(ap, fmt, e);
+  }
+#endif
+  if (p == 0) *e = "Invalid pointer";
+  return p;
+}
+
+/* Encode p for MATLAB: 1x1 single 0 if p is null, else p printed with fmt. */
+mxArray *mxWrapCreateP_single(void *p, const char *fmt) {
+  char pbuf[128];
+  if (p == 0) {
+    mxArray *z               = mxCreateNumericMatrix(1, 1, mxSINGLE_CLASS, mxREAL);
+    *((float *)mxGetData(z)) = 0;
+    return z;
+  }
+  snprintf(pbuf, sizeof(pbuf), fmt, p); /* truncates rather than overrunning pbuf */
+  return mxCreateString(pbuf);
+}
+mxArray *mxWrapStrncpy_single(const char *s) { /* wrap the C string s as a MATLAB value */
+  if (s) {
+    return mxCreateString(s);
+  } else { /* null s is returned as a 1-by-1 single holding 0 */
+    mxArray *z               = mxCreateNumericMatrix(1, 1, mxSINGLE_CLASS, mxREAL);
+    *((float *)mxGetData(z)) = 0;
+    return z;
+  }
+}
+
+float mxWrapGetScalar_single(const mxArray *a, const char **e) { /* read a 1-element single */
+  if (!a || mxGetClassID(a) != mxSINGLE_CLASS || mxGetM(a) * mxGetN(a) != 1) {
+    *e = "Invalid scalar argument"; /* failure is reported through *e; 0 is returned */
+    return 0;
+  }
+  return *((float *)mxGetData(a));
+}
+
+char *mxWrapGetString_single(const mxArray *a, const char **e) { /* copy a into a new C string */
+  char *s;
+  mwSize slen;
+  if (!a || (!mxIsChar(a) && mxGetM(a) * mxGetN(a) > 0)) { /* non-char ok only if empty */
+    *e = "Invalid string argument, mxSINGLE_CLASS expected"; /* NULL is returned */
+    return NULL;
+  }
+  slen = mxGetM(a) * mxGetN(a) + 1; /* element count plus one for the terminator */
+  s    = (char *)mxMalloc(slen);    /* mxMalloc'd buffer is returned to the caller */
+  if (mxGetM(a) * mxGetN(a) == 0)
+    *s = 0; /* empty input yields the empty string */
+  else
+    mxGetString(a, s, slen);
+  return s;
+}
+
+#define mxWrapGetArrayDef_single(func, T) /* defines func: single mxArray -> new T[] */ \
+  T *func(const mxArray *a, const char **e) {                    \
+    T *array;                                                    \
+    mwSize arraylen;                                             \
+    mwIndex i;                                                   \
+    T *p;                                                        \
+    float *q;                                                    \
+    if (!a || mxGetClassID(a) != mxSINGLE_CLASS) {               \
+      *e = "Invalid array argument, mxSINGLE_CLASS expected";    \
+      return 0; /* failure: message left in *e */                \
+    }                                                            \
+    arraylen = mxGetM(a) * mxGetN(a);                            \
+    array    = (T *)mxMalloc(mxGetM(a) * mxGetN(a) * sizeof(T)); \
+    p        = array;                                            \
+    q        = (float *)mxGetData(a);                            \
+    for (i = 0; i < arraylen; ++i) *p++ = (T)(*q++); /* cast each element to T */ \
+    return array; /* mxMalloc'd copy, returned to the caller */  \
+  }
+
+#define mxWrapCopyDef_single(func, T) /* defines func: copy n T values from q into a */ \
+  void func(mxArray *a, const T *q, mwSize n) { \
+    mwIndex i;                                  \
+    float *p = (float *)mxGetData(a); /* destination is accessed as float data */ \
+    for (i = 0; i < n; ++i) *p++ = *q++;        \
+  }
+
+#define mxWrapReturnDef_single(func, T) /* defines func: T[] -> new m-by-n real single mxArray */ \
+  mxArray *func(const T *q, mwSize m, mwSize n) {                       \
+    mwIndex i;                                                          \
+    float *p;                                                           \
+    if (!q) { /* null q: return an empty 0-by-0 single matrix */ \
+      return mxCreateNumericMatrix(0, 0, mxSINGLE_CLASS, mxREAL);       \
+    } else {                                                            \
+      mxArray *a = mxCreateNumericMatrix(m, n, mxSINGLE_CLASS, mxREAL); \
+      p          = (float *)mxGetData(a);                               \
+      for (i = 0; i < m * n; ++i) *p++ = *q++; /* copy all m*n elements in order */ \
+      return a;                                                         \
+    }                                                                   \
+  }
 
 #define mxWrapGetScalarZDef_single(func, T, ZT, setz) \
-void func(T* z, const mxArray* a) \
-{ \
-    float* pr = (float*) mxGetData(a);		\
-    float* pi = (float*) mxGetImagData(a);		 \
-    setz(z, (ZT) *pr, (pi ? (ZT) *pi : (ZT) 0)); \
-}
-
-
-#define mxWrapGetArrayZDef_single(func, T, ZT, setz) \
-T* func(const mxArray* a, const char** e) \
-{ \
-    T* array; \
-    mwSize arraylen; \
-    mwIndex i; \
-    T* p; \
-    float* qr; \
-    float* qi; \
-    if (!a || mxGetClassID(a) != mxSINGLE_CLASS) { \
-        *e = "Invalid array argument, mxSINGLE_CLASS expected"; \
-        return 0; \
-    } \
-    arraylen = mxGetM(a)*mxGetN(a); \
-    array = (T*) mxMalloc(mxGetM(a)*mxGetN(a) * sizeof(T)); \
-    p = array; \
-    qr = (float*) mxGetData(a);			\
-    qi = (float*) mxGetImagData(a);			\
-    for (i = 0; i < arraylen; ++i) { \
-        ZT val_qr = *qr++; \
-        ZT val_qi = (qi ? (ZT) *qi++ : (ZT) 0); \
-        setz(p, val_qr, val_qi); \
-        ++p; \
-    } \
-    return array; \
-}
-
+  void func(T *z, const mxArray *a) {                 \
+    float *pr = (float *)mxGetData(a);                \
+    float *pi = (float *)mxGetImagData(a);            \
+    setz(z, (ZT) * pr, (pi ? (ZT) * pi : (ZT)0));     \
+  }
+
+#define mxWrapGetArrayZDef_single(func, T, ZT, setz) /* defines func: single mxArray -> new complex T[] */ \
+  T *func(const mxArray *a, const char **e) {                    \
+    T *array;                                                    \
+    mwSize arraylen;                                             \
+    mwIndex i;                                                   \
+    T *p;                                                        \
+    float *qr;                                                   \
+    float *qi;                                                   \
+    if (!a || mxGetClassID(a) != mxSINGLE_CLASS) {               \
+      *e = "Invalid array argument, mxSINGLE_CLASS expected";    \
+      return 0; /* failure: message left in *e */                \
+    }                                                            \
+    arraylen = mxGetM(a) * mxGetN(a);                            \
+    array    = (T *)mxMalloc(mxGetM(a) * mxGetN(a) * sizeof(T)); \
+    p        = array;                                            \
+    qr       = (float *)mxGetData(a);                            \
+    qi       = (float *)mxGetImagData(a); /* tested for null inside the loop */ \
+    for (i = 0; i < arraylen; ++i) {                             \
+      ZT val_qr = *qr++;                                         \
+      ZT val_qi = (qi ? (ZT) * qi++ : (ZT)0); /* null qi: imaginary part is 0 */ \
+      setz(p, val_qr, val_qi); /* setz(dst, re, im) stores one complex value */ \
+      ++p;                                                       \
+    }                                                            \
+    return array; /* mxMalloc'd copy, returned to the caller */  \
+  }
 
 #define mxWrapCopyZDef_single(func, T, real, imag) \
-void func(mxArray* a, const T* q, mwSize n) \
-{ \
-    mwIndex i; \
-    float* pr = (float*) mxGetData(a);		\
-    float* pi = (float*) mxGetImagData(a);		\
-    for (i = 0; i < n; ++i) { \
-        *pr++ = real(*q); \
-        *pi++ = imag(*q); \
-        ++q; \
-    } \
-}
-
-
-#define mxWrapReturnZDef_single(func, T, real, imag) \
-mxArray* func(const T* q, mwSize m, mwSize n) \
-{ \
-    mwIndex i; \
-    float* pr; \
-    float* pi; \
-    if (!q) { \
-      return mxCreateNumericMatrix(0,0, mxSINGLE_CLASS, mxCOMPLEX); \
-    } else { \
-        mxArray* a = mxCreateNumericMatrix(m,n, mxSINGLE_CLASS, mxCOMPLEX);\
-        pr = (float*) mxGetData(a);					\
-        pi = (float*) mxGetImagData(a);					\
-        for (i = 0; i < m*n; ++i) { \
-            *pr++ = real(*q); \
-            *pi++ = imag(*q); \
-            ++q; \
-        } \
-        return a; \
-    } \
-}
-
-
-
-
+  void func(mxArray *a, const T *q, mwSize n) {    \
+    mwIndex i;                                     \
+    float *pr = (float *)mxGetData(a);             \
+    float *pi = (float *)mxGetImagData(a);         \
+    for (i = 0; i < n; ++i) {                      \
+      *pr++ = real(*q);                            \
+      *pi++ = imag(*q);                            \
+      ++q;                                         \
+    }                                              \
+  }
+
+#define mxWrapReturnZDef_single(func, T, real, imag) /* defines func: T[] -> new m-by-n complex single mxArray */ \
+  mxArray *func(const T *q, mwSize m, mwSize n) {                          \
+    mwIndex i;                                                             \
+    float *pr;                                                             \
+    float *pi;                                                             \
+    if (!q) { /* null q: return an empty 0-by-0 complex single matrix */ \
+      return mxCreateNumericMatrix(0, 0, mxSINGLE_CLASS, mxCOMPLEX);       \
+    } else {                                                               \
+      mxArray *a = mxCreateNumericMatrix(m, n, mxSINGLE_CLASS, mxCOMPLEX); \
+      pr         = (float *)mxGetData(a);                                  \
+      pi         = (float *)mxGetImagData(a);                              \
+      for (i = 0; i < m * n; ++i) { /* copy all m*n elements in order */ \
+        *pr++ = real(*q); /* real and imaginary parts go to separate arrays */ \
+        *pi++ = imag(*q);                                                  \
+        ++q;                                                               \
+      }                                                                    \
+      return a;                                                            \
+    }                                                                      \
+  }
 
 #endif
 
 #include <complex>
 
 typedef std::complex<double> dcomplex;
-#define real_dcomplex(z) std::real(z)
-#define imag_dcomplex(z) std::imag(z)
-#define setz_dcomplex(z,r,i)  *z = dcomplex(r,i)
+#define real_dcomplex(z)       std::real(z)
+#define imag_dcomplex(z)       std::imag(z)
+#define setz_dcomplex(z, r, i) *z = dcomplex(r, i)
 
 typedef std::complex<float> fcomplex;
-#define real_fcomplex(z) std::real(z)
-#define imag_fcomplex(z) std::imag(z)
-#define setz_fcomplex(z,r,i)  *z = fcomplex(r,i)
-
- #include <finufft.h>
- #include <mex.h>
- #include <iostream>
- #include <cstring>
- #include <math.h>
- void copy_finufft_opts(const mxArray* om, finufft_opts *oc) {
-   if(!mxIsStruct(om))
-     mexErrMsgIdAndTxt("FINUFFT:inputNotStruct","opts input must be a structure.");
-   mwIndex idx = 0;
-   int ifield, nfields;
-   const char **fname;
-   nfields = mxGetNumberOfFields(om);
-   fname = (const char**)mxCalloc(nfields, sizeof(*fname));
-   for(ifield=0; ifield<nfields; ifield++) {
-     fname[ifield] = mxGetFieldNameByNumber(om,ifield);
-     if (strcmp(fname[ifield],"debug") == 0) {
-       oc->debug = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield)));
-     }
-     else if (strcmp(fname[ifield],"spread_debug") == 0) {
-       oc->spread_debug = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield)));
-     }
-     else if (strcmp(fname[ifield],"spread_sort") == 0) {
-       oc->spread_sort = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield)));
-     }
-     else if (strcmp(fname[ifield],"spread_kerevalmeth") == 0) {
-       oc->spread_kerevalmeth = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield)));
-     }
-     else if (strcmp(fname[ifield],"spread_kerpad") == 0) {
-       oc->spread_kerpad = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield)));
-     }
-     else if (strcmp(fname[ifield],"fftw") == 0) {
-       oc->fftw = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield)));
-     }
-     else if (strcmp(fname[ifield],"modeord") == 0) {
-       oc->modeord = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield)));
-     }
-     else if (strcmp(fname[ifield],"upsampfac") == 0) {
-       oc->upsampfac = (double)*mxGetPr(mxGetFieldByNumber(om,idx,ifield));
-     }
-     else if (strcmp(fname[ifield],"spread_thread") == 0) {
-       oc->spread_thread = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield)));
-     }
-     else if (strcmp(fname[ifield],"maxbatchsize") == 0) {
-       oc->maxbatchsize = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield)));
-     }
-     else if (strcmp(fname[ifield],"nthreads") == 0) {
-       oc->nthreads = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield)));
-     }
-     else if (strcmp(fname[ifield],"spread_nthr_atomic") == 0) {
-       oc->spread_nthr_atomic = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield)));
-     }
-     else if (strcmp(fname[ifield],"spread_max_sp_size") == 0) {
-       oc->spread_max_sp_size = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield)));
-     }
-     else
-       continue;
-   }
-   mxFree(fname);
- }
- void finufft_mex_setup() {
-   /* Forces MATLAB to properly initialize their FFTW library. */
-   mexEvalString("fft(1:8);");
- }
-
-
+#define real_fcomplex(z)       std::real(z)
+#define imag_fcomplex(z)       std::imag(z)
+#define setz_fcomplex(z, r, i) *z = fcomplex(r, i)
+
+#include <cstring>
+#include <finufft.h>
+#include <iostream>
+#include <math.h>
+#include <mex.h>
+/* Copy the recognized option fields of MATLAB struct om (element 0) into *oc.
+ * Fields with other names are ignored, and members of *oc that have no matching
+ * field are left unchanged. Raises a MATLAB error if om is not a struct, or if a
+ * recognized field is empty or is neither numeric nor logical (the previous code
+ * dereferenced mxGetPr() of the field without any such check). */
+void copy_finufft_opts(const mxArray *om, finufft_opts *oc) {
+  if (!mxIsStruct(om))
+    mexErrMsgIdAndTxt("FINUFFT:inputNotStruct", "opts input must be a structure.");
+  const int nfields = mxGetNumberOfFields(om);
+  for (int ifield = 0; ifield < nfields; ifield++) {
+    const char *name = mxGetFieldNameByNumber(om, ifield);
+    /* Map the field name to its destination. Every option except upsampfac is
+     * taken to be an int member (NOTE(review): confirm against finufft_opts). */
+    int *iopt = NULL;
+    if (strcmp(name, "debug") == 0) iopt = &oc->debug;
+    else if (strcmp(name, "spread_debug") == 0) iopt = &oc->spread_debug;
+    else if (strcmp(name, "spread_sort") == 0) iopt = &oc->spread_sort;
+    else if (strcmp(name, "spread_kerevalmeth") == 0) iopt = &oc->spread_kerevalmeth;
+    else if (strcmp(name, "spread_kerpad") == 0) iopt = &oc->spread_kerpad;
+    else if (strcmp(name, "fftw") == 0) iopt = &oc->fftw;
+    else if (strcmp(name, "modeord") == 0) iopt = &oc->modeord;
+    else if (strcmp(name, "spread_thread") == 0) iopt = &oc->spread_thread;
+    else if (strcmp(name, "maxbatchsize") == 0) iopt = &oc->maxbatchsize;
+    else if (strcmp(name, "nthreads") == 0) iopt = &oc->nthreads;
+    else if (strcmp(name, "spread_nthr_atomic") == 0) iopt = &oc->spread_nthr_atomic;
+    else if (strcmp(name, "spread_max_sp_size") == 0) iopt = &oc->spread_max_sp_size;
+    else if (strcmp(name, "upsampfac") != 0)
+      continue; /* not an option this routine knows about */
+    const mxArray *v = mxGetFieldByNumber(om, 0, ifield);
+    /* Dereferencing mxGetPr is only valid for non-empty real double data, so
+     * validate the field and let mxGetScalar do the conversion to double. */
+    if (!v || mxIsEmpty(v) || !(mxIsNumeric(v) || mxIsLogical(v)))
+      mexErrMsgIdAndTxt("FINUFFT:badOptsField",
+                        "opts.%s must be a non-empty numeric or logical value.", name);
+    const double val = mxGetScalar(v);
+    if (iopt)
+      *iopt = (int)round(val); /* integer options are rounded, as before */
+    else
+      oc->upsampfac = val; /* the one option stored as a double */
+  }
+}
+void finufft_mex_setup() { /* setup step: runs a small fft() call inside MATLAB */
+  /* Forces MATLAB to properly initialize their FFTW library. */
+  mexEvalString("fft(1:8);"); /* the result of the evaluated command is not used here */
+}
 
 /* Array copier definitions */
-mxWrapGetArrayDef(mxWrapGetArray_bool, bool)
-mxWrapCopyDef    (mxWrapCopy_bool,     bool)
-mxWrapReturnDef  (mxWrapReturn_bool,   bool)
-mxWrapGetArrayDef_single(mxWrapGetArray_single_bool, bool)
-mxWrapCopyDef_single    (mxWrapCopy_single_bool,     bool)
-mxWrapReturnDef_single  (mxWrapReturn_single_bool,   bool)
-mxWrapGetArrayDef(mxWrapGetArray_char, char)
-mxWrapCopyDef    (mxWrapCopy_char,     char)
-mxWrapReturnDef  (mxWrapReturn_char,   char)
-mxWrapGetArrayDef_single(mxWrapGetArray_single_char, char)
-mxWrapCopyDef_single    (mxWrapCopy_single_char,     char)
-mxWrapReturnDef_single  (mxWrapReturn_single_char,   char)
-mxWrapGetArrayDef(mxWrapGetArray_double, double)
-mxWrapCopyDef    (mxWrapCopy_double,     double)
-mxWrapReturnDef  (mxWrapReturn_double,   double)
-mxWrapGetArrayDef_single(mxWrapGetArray_single_double, double)
-mxWrapCopyDef_single    (mxWrapCopy_single_double,     double)
-mxWrapReturnDef_single  (mxWrapReturn_single_double,   double)
-mxWrapGetArrayDef(mxWrapGetArray_float, float)
-mxWrapCopyDef    (mxWrapCopy_float,     float)
-mxWrapReturnDef  (mxWrapReturn_float,   float)
-mxWrapGetArrayDef_single(mxWrapGetArray_single_float, float)
-mxWrapCopyDef_single    (mxWrapCopy_single_float,     float)
-mxWrapReturnDef_single  (mxWrapReturn_single_float,   float)
-mxWrapGetArrayDef(mxWrapGetArray_int, int)
-mxWrapCopyDef    (mxWrapCopy_int,     int)
-mxWrapReturnDef  (mxWrapReturn_int,   int)
-mxWrapGetArrayDef_single(mxWrapGetArray_single_int, int)
-mxWrapCopyDef_single    (mxWrapCopy_single_int,     int)
-mxWrapReturnDef_single  (mxWrapReturn_single_int,   int)
-mxWrapGetArrayDef(mxWrapGetArray_int64_t, int64_t)
-mxWrapCopyDef    (mxWrapCopy_int64_t,     int64_t)
-mxWrapReturnDef  (mxWrapReturn_int64_t,   int64_t)
-mxWrapGetArrayDef_single(mxWrapGetArray_single_int64_t, int64_t)
-mxWrapCopyDef_single    (mxWrapCopy_single_int64_t,     int64_t)
-mxWrapReturnDef_single  (mxWrapReturn_single_int64_t,   int64_t)
-mxWrapGetArrayDef(mxWrapGetArray_long, long)
-mxWrapCopyDef    (mxWrapCopy_long,     long)
-mxWrapReturnDef  (mxWrapReturn_long,   long)
-mxWrapGetArrayDef_single(mxWrapGetArray_single_long, long)
-mxWrapCopyDef_single    (mxWrapCopy_single_long,     long)
-mxWrapReturnDef_single  (mxWrapReturn_single_long,   long)
-mxWrapGetArrayDef(mxWrapGetArray_ptrdiff_t, ptrdiff_t)
-mxWrapCopyDef    (mxWrapCopy_ptrdiff_t,     ptrdiff_t)
-mxWrapReturnDef  (mxWrapReturn_ptrdiff_t,   ptrdiff_t)
-mxWrapGetArrayDef_single(mxWrapGetArray_single_ptrdiff_t, ptrdiff_t)
-mxWrapCopyDef_single    (mxWrapCopy_single_ptrdiff_t,     ptrdiff_t)
-mxWrapReturnDef_single  (mxWrapReturn_single_ptrdiff_t,   ptrdiff_t)
-mxWrapGetArrayDef(mxWrapGetArray_short, short)
-mxWrapCopyDef    (mxWrapCopy_short,     short)
-mxWrapReturnDef  (mxWrapReturn_short,   short)
-mxWrapGetArrayDef_single(mxWrapGetArray_single_short, short)
-mxWrapCopyDef_single    (mxWrapCopy_single_short,     short)
-mxWrapReturnDef_single  (mxWrapReturn_single_short,   short)
-mxWrapGetArrayDef(mxWrapGetArray_size_t, size_t)
-mxWrapCopyDef    (mxWrapCopy_size_t,     size_t)
-mxWrapReturnDef  (mxWrapReturn_size_t,   size_t)
-mxWrapGetArrayDef_single(mxWrapGetArray_single_size_t, size_t)
-mxWrapCopyDef_single    (mxWrapCopy_single_size_t,     size_t)
-mxWrapReturnDef_single  (mxWrapReturn_single_size_t,   size_t)
-mxWrapGetScalarZDef(mxWrapGetScalar_fcomplex, fcomplex,
-                    float, setz_fcomplex)
-mxWrapGetArrayZDef (mxWrapGetArray_fcomplex, fcomplex,
-                    float, setz_fcomplex)
-mxWrapCopyZDef     (mxWrapCopy_fcomplex, fcomplex,
-                    real_fcomplex, imag_fcomplex)
-mxWrapReturnZDef   (mxWrapReturn_fcomplex, fcomplex,
-                    real_fcomplex, imag_fcomplex)
-mxWrapGetScalarZDef_single(mxWrapGetScalar_single_fcomplex, fcomplex,
-                    float, setz_fcomplex)
-mxWrapGetArrayZDef_single (mxWrapGetArray_single_fcomplex, fcomplex,
-                    float, setz_fcomplex)
-mxWrapCopyZDef_single     (mxWrapCopy_single_fcomplex, fcomplex,
-                    real_fcomplex, imag_fcomplex)
-mxWrapReturnZDef_single   (mxWrapReturn_single_fcomplex, fcomplex,
-                    real_fcomplex, imag_fcomplex)
-mxWrapGetScalarZDef(mxWrapGetScalar_dcomplex, dcomplex,
-                    double, setz_dcomplex)
-mxWrapGetArrayZDef (mxWrapGetArray_dcomplex, dcomplex,
-                    double, setz_dcomplex)
-mxWrapCopyZDef     (mxWrapCopy_dcomplex, dcomplex,
-                    real_dcomplex, imag_dcomplex)
-mxWrapReturnZDef   (mxWrapReturn_dcomplex, dcomplex,
-                    real_dcomplex, imag_dcomplex)
-mxWrapGetScalarZDef_single(mxWrapGetScalar_single_dcomplex, dcomplex,
-                    double, setz_dcomplex)
-mxWrapGetArrayZDef_single (mxWrapGetArray_single_dcomplex, dcomplex,
-                    double, setz_dcomplex)
-mxWrapCopyZDef_single     (mxWrapCopy_single_dcomplex, dcomplex,
-                    real_dcomplex, imag_dcomplex)
-mxWrapReturnZDef_single   (mxWrapReturn_single_dcomplex, dcomplex,
-                    real_dcomplex, imag_dcomplex)
-
-/* ---- finufft.mw: 166 ----
- * finufft_mex_setup();
- */
-static const char* stubids1_ = "finufft_mex_setup()";
-
-void mexStub1(int nlhs, mxArray* plhs[],
-              int nrhs, const mxArray* prhs[])
-{
-    const char* mw_err_txt_ = 0;
-    if (mexprofrecord_)
-        mexprofrecord_[1]++;
-    finufft_mex_setup();
+mxWrapGetArrayDef(mxWrapGetArray_bool, bool) mxWrapCopyDef(mxWrapCopy_bool, bool) mxWrapReturnDef(
+    mxWrapReturn_bool,
+    bool) mxWrapGetArrayDef_single(mxWrapGetArray_single_bool,
+                                   bool) mxWrapCopyDef_single(mxWrapCopy_single_bool,
+                                                              bool) mxWrapReturnDef_single(mxWrapReturn_single_bool,
+                                                                                           bool)
+    mxWrapGetArrayDef(mxWrapGetArray_char, char) mxWrapCopyDef(mxWrapCopy_char, char) mxWrapReturnDef(
+        mxWrapReturn_char,
+        char) mxWrapGetArrayDef_single(mxWrapGetArray_single_char,
+                                       char) mxWrapCopyDef_single(mxWrapCopy_single_char,
+                                                                  char)
+        mxWrapReturnDef_single(mxWrapReturn_single_char, char) mxWrapGetArrayDef(
+            mxWrapGetArray_double,
+            double) mxWrapCopyDef(mxWrapCopy_double,
+                                  double) mxWrapReturnDef(mxWrapReturn_double,
+                                                          double) mxWrapGetArrayDef_single(mxWrapGetArray_single_double,
+                                                                                           double)
+            mxWrapCopyDef_single(mxWrapCopy_single_double, double) mxWrapReturnDef_single(
+                mxWrapReturn_single_double,
+                double) mxWrapGetArrayDef(mxWrapGetArray_float,
+                                          float) mxWrapCopyDef(mxWrapCopy_float,
+                                                               float) mxWrapReturnDef(mxWrapReturn_float,
+                                                                                      float)
+                mxWrapGetArrayDef_single(mxWrapGetArray_single_float, float) mxWrapCopyDef_single(
+                    mxWrapCopy_single_float,
+                    float) mxWrapReturnDef_single(mxWrapReturn_single_float,
+                                                  float) mxWrapGetArrayDef(mxWrapGetArray_int,
+                                                                           int)
+                    mxWrapCopyDef(mxWrapCopy_int, int) mxWrapReturnDef(mxWrapReturn_int, int) mxWrapGetArrayDef_single(
+                        mxWrapGetArray_single_int,
+                        int) mxWrapCopyDef_single(mxWrapCopy_single_int,
+                                                  int) mxWrapReturnDef_single(mxWrapReturn_single_int,
+                                                                              int) mxWrapGetArrayDef(mxWrapGetArray_int64_t,
+                                                                                                     int64_t)
+                        mxWrapCopyDef(mxWrapCopy_int64_t, int64_t) mxWrapReturnDef(mxWrapReturn_int64_t, int64_t) mxWrapGetArrayDef_single(
+                            mxWrapGetArray_single_int64_t,
+                            int64_t) mxWrapCopyDef_single(mxWrapCopy_single_int64_t,
+                                                          int64_t) mxWrapReturnDef_single(mxWrapReturn_single_int64_t,
+                                                                                          int64_t)
+                            mxWrapGetArrayDef(mxWrapGetArray_long, long) mxWrapCopyDef(mxWrapCopy_long, long) mxWrapReturnDef(
+                                mxWrapReturn_long,
+                                long) mxWrapGetArrayDef_single(mxWrapGetArray_single_long,
+                                                               long) mxWrapCopyDef_single(mxWrapCopy_single_long,
+                                                                                          long)
+                                mxWrapReturnDef_single(mxWrapReturn_single_long, long) mxWrapGetArrayDef(
+                                    mxWrapGetArray_ptrdiff_t,
+                                    ptrdiff_t) mxWrapCopyDef(mxWrapCopy_ptrdiff_t,
+                                                             ptrdiff_t) mxWrapReturnDef(mxWrapReturn_ptrdiff_t, ptrdiff_t)
+                                    mxWrapGetArrayDef_single(mxWrapGetArray_single_ptrdiff_t, ptrdiff_t) mxWrapCopyDef_single(
+                                        mxWrapCopy_single_ptrdiff_t,
+                                        ptrdiff_t) mxWrapReturnDef_single(mxWrapReturn_single_ptrdiff_t,
+                                                                          ptrdiff_t)
+                                        mxWrapGetArrayDef(mxWrapGetArray_short, short) mxWrapCopyDef(
+                                            mxWrapCopy_short,
+                                            short) mxWrapReturnDef(mxWrapReturn_short,
+                                                                   short) mxWrapGetArrayDef_single(mxWrapGetArray_single_short,
+                                                                                                   short)
+                                            mxWrapCopyDef_single(mxWrapCopy_single_short, short) mxWrapReturnDef_single(
+                                                mxWrapReturn_single_short,
+                                                short) mxWrapGetArrayDef(mxWrapGetArray_size_t,
+                                                                         size_t) mxWrapCopyDef(mxWrapCopy_size_t, size_t)
+                                                mxWrapReturnDef(mxWrapReturn_size_t, size_t) mxWrapGetArrayDef_single(
+                                                    mxWrapGetArray_single_size_t,
+                                                    size_t) mxWrapCopyDef_single(mxWrapCopy_single_size_t, size_t)
+                                                    mxWrapReturnDef_single(mxWrapReturn_single_size_t, size_t) mxWrapGetScalarZDef(
+                                                        mxWrapGetScalar_fcomplex,
+                                                        fcomplex, float,
+                                                        setz_fcomplex) mxWrapGetArrayZDef(mxWrapGetArray_fcomplex, fcomplex, float, setz_fcomplex)
+                                                        mxWrapCopyZDef(mxWrapCopy_fcomplex, fcomplex, real_fcomplex, imag_fcomplex) mxWrapReturnZDef(
+                                                            mxWrapReturn_fcomplex,
+                                                            fcomplex, real_fcomplex,
+                                                            imag_fcomplex)
+                                                            mxWrapGetScalarZDef_single(
+                                                                mxWrapGetScalar_single_fcomplex,
+                                                                fcomplex, float,
+                                                                setz_fcomplex) mxWrapGetArrayZDef_single(mxWrapGetArray_single_fcomplex,
+                                                                                                         fcomplex,
+                                                                                                         float, setz_fcomplex)
+                                                                mxWrapCopyZDef_single(
+                                                                    mxWrapCopy_single_fcomplex,
+                                                                    fcomplex,
+                                                                    real_fcomplex,
+                                                                    imag_fcomplex)
+                                                                    mxWrapReturnZDef_single(
+                                                                        mxWrapReturn_single_fcomplex,
+                                                                        fcomplex,
+                                                                        real_fcomplex,
+                                                                        imag_fcomplex) mxWrapGetScalarZDef(mxWrapGetScalar_dcomplex,
+                                                                                                           dcomplex,
+                                                                                                           double,
+                                                                                                           setz_dcomplex)
+                                                                        mxWrapGetArrayZDef(
+                                                                            mxWrapGetArray_dcomplex,
+                                                                            dcomplex,
+                                                                            double,
+                                                                            setz_dcomplex)
+                                                                            mxWrapCopyZDef(
+                                                                                mxWrapCopy_dcomplex,
+                                                                                dcomplex,
+                                                                                real_dcomplex,
+                                                                                imag_dcomplex)
+                                                                                mxWrapReturnZDef(
+                                                                                    mxWrapReturn_dcomplex,
+                                                                                    dcomplex,
+                                                                                    real_dcomplex,
+                                                                                    imag_dcomplex)
+                                                                                    mxWrapGetScalarZDef_single(
+                                                                                        mxWrapGetScalar_single_dcomplex,
+                                                                                        dcomplex,
+                                                                                        double,
+                                                                                        setz_dcomplex)
+                                                                                        mxWrapGetArrayZDef_single(
+                                                                                            mxWrapGetArray_single_dcomplex,
+                                                                                            dcomplex,
+                                                                                            double,
+                                                                                            setz_dcomplex)
+                                                                                            mxWrapCopyZDef_single(
+                                                                                                mxWrapCopy_single_dcomplex,
+                                                                                                dcomplex,
+                                                                                                real_dcomplex,
+                                                                                                imag_dcomplex)
+                                                                                                mxWrapReturnZDef_single(
+                                                                                                    mxWrapReturn_single_dcomplex,
+                                                                                                    dcomplex,
+                                                                                                    real_dcomplex,
+                                                                                                    imag_dcomplex)
+
+    /* ---- finufft.mw: 166 ----
+     * finufft_mex_setup();
+     */
+    static const char *stubids1_ = "finufft_mex_setup()";
+
+void mexStub1(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
+  const char *mw_err_txt_ = 0;
+  if (mexprofrecord_) mexprofrecord_[1]++;
+  finufft_mex_setup();
 
 mw_err_label:
-    if (mw_err_txt_)
-        mexErrMsgTxt(mw_err_txt_);
+  if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_);
 }
 
 /* ---- finufft.mw: 167 ----
  * finufft_opts* o = new();
  */
-static const char* stubids2_ = "o finufft_opts* = new()";
+static const char *stubids2_ = "o finufft_opts* = new()";
 
-void mexStub2(int nlhs, mxArray* plhs[],
-              int nrhs, const mxArray* prhs[])
-{
-    const char* mw_err_txt_ = 0;
-    finufft_opts*  out0_=0; /* o          */
+void mexStub2(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
+  const char *mw_err_txt_ = 0;
+  finufft_opts *out0_     = 0; /* o          */
 
-    if (mexprofrecord_)
-        mexprofrecord_[2]++;
-    out0_ = new finufft_opts();
-    plhs[0] = mxWrapCreateP(out0_, "finufft_opts:%p");
+  if (mexprofrecord_) mexprofrecord_[2]++;
+  out0_   = new finufft_opts();
+  plhs[0] = mxWrapCreateP(out0_, "finufft_opts:%p");
 
 mw_err_label:
-    if (mw_err_txt_)
-        mexErrMsgTxt(mw_err_txt_);
+  if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_);
 }
 
 /* ---- finufft.mw: 169 ----
  * finufft_plan* p = new();
  */
-static const char* stubids3_ = "o finufft_plan* = new()";
+static const char *stubids3_ = "o finufft_plan* = new()";
 
-void mexStub3(int nlhs, mxArray* plhs[],
-              int nrhs, const mxArray* prhs[])
-{
-    const char* mw_err_txt_ = 0;
-    finufft_plan*  out0_=0; /* p          */
+void mexStub3(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
+  const char *mw_err_txt_ = 0;
+  finufft_plan *out0_     = 0; /* p          */
 
-    if (mexprofrecord_)
-        mexprofrecord_[3]++;
-    out0_ = new finufft_plan();
-    plhs[0] = mxWrapCreateP(out0_, "finufft_plan:%p");
+  if (mexprofrecord_) mexprofrecord_[3]++;
+  out0_   = new finufft_plan();
+  plhs[0] = mxWrapCreateP(out0_, "finufft_plan:%p");
 
 mw_err_label:
-    if (mw_err_txt_)
-        mexErrMsgTxt(mw_err_txt_);
+  if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_);
 }
 
 /* ---- finufft.mw: 170 ----
  * finufft_default_opts(finufft_opts* o);
  */
-static const char* stubids4_ = "finufft_default_opts(i finufft_opts*)";
+static const char *stubids4_ = "finufft_default_opts(i finufft_opts*)";
 
-void mexStub4(int nlhs, mxArray* plhs[],
-              int nrhs, const mxArray* prhs[])
-{
-    const char* mw_err_txt_ = 0;
-    finufft_opts*  in0_ =0; /* o          */
+void mexStub4(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
+  const char *mw_err_txt_ = 0;
+  finufft_opts *in0_      = 0; /* o          */
 
-    in0_ = (finufft_opts*) mxWrapGetP(prhs[0], "finufft_opts:%p", &mw_err_txt_);
-    if (mw_err_txt_)
-        goto mw_err_label;
-    if (mexprofrecord_)
-        mexprofrecord_[4]++;
-    finufft_default_opts(in0_);
+  in0_ = (finufft_opts *)mxWrapGetP(prhs[0], "finufft_opts:%p", &mw_err_txt_);
+  if (mw_err_txt_) goto mw_err_label;
+  if (mexprofrecord_) mexprofrecord_[4]++;
+  finufft_default_opts(in0_);
 
 mw_err_label:
-    if (mw_err_txt_)
-        mexErrMsgTxt(mw_err_txt_);
+  if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_);
 }
 
 /* ---- finufft.mw: 172 ----
  * finufftf_plan* p = new();
  */
-static const char* stubids5_ = "o finufftf_plan* = new()";
+static const char *stubids5_ = "o finufftf_plan* = new()";
 
-void mexStub5(int nlhs, mxArray* plhs[],
-              int nrhs, const mxArray* prhs[])
-{
-    const char* mw_err_txt_ = 0;
-    finufftf_plan*  out0_=0; /* p          */
+void mexStub5(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
+  const char *mw_err_txt_ = 0;
+  finufftf_plan *out0_    = 0; /* p          */
 
-    if (mexprofrecord_)
-        mexprofrecord_[5]++;
-    out0_ = new finufftf_plan();
-    plhs[0] = mxWrapCreateP(out0_, "finufftf_plan:%p");
+  if (mexprofrecord_) mexprofrecord_[5]++;
+  out0_   = new finufftf_plan();
+  plhs[0] = mxWrapCreateP(out0_, "finufftf_plan:%p");
 
 mw_err_label:
-    if (mw_err_txt_)
-        mexErrMsgTxt(mw_err_txt_);
+  if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_);
 }
 
 /* ---- finufft.mw: 173 ----
  * finufftf_default_opts(finufft_opts* o);
  */
-static const char* stubids6_ = "finufftf_default_opts(i finufft_opts*)";
+static const char *stubids6_ = "finufftf_default_opts(i finufft_opts*)";
 
-void mexStub6(int nlhs, mxArray* plhs[],
-              int nrhs, const mxArray* prhs[])
-{
-    const char* mw_err_txt_ = 0;
-    finufft_opts*  in0_ =0; /* o          */
+void mexStub6(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
+  const char *mw_err_txt_ = 0;
+  finufft_opts *in0_      = 0; /* o          */
 
-    in0_ = (finufft_opts*) mxWrapGetP(prhs[0], "finufft_opts:%p", &mw_err_txt_);
-    if (mw_err_txt_)
-        goto mw_err_label;
-    if (mexprofrecord_)
-        mexprofrecord_[6]++;
-    finufftf_default_opts(in0_);
+  in0_ = (finufft_opts *)mxWrapGetP(prhs[0], "finufft_opts:%p", &mw_err_txt_);
+  if (mw_err_txt_) goto mw_err_label;
+  if (mexprofrecord_) mexprofrecord_[6]++;
+  finufftf_default_opts(in0_);
 
 mw_err_label:
-    if (mw_err_txt_)
-        mexErrMsgTxt(mw_err_txt_);
+  if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_);
 }
 
 /* ---- finufft.mw: 184 ----
  * copy_finufft_opts(mxArray opts, finufft_opts* o);
  */
-static const char* stubids7_ = "copy_finufft_opts(i mxArray, i finufft_opts*)";
-
-void mexStub7(int nlhs, mxArray* plhs[],
-              int nrhs, const mxArray* prhs[])
-{
-    const char* mw_err_txt_ = 0;
-    const mxArray*  in0_;    /* opts       */
-    finufft_opts*  in1_ =0; /* o          */
-
-    in0_ = prhs[0];
-    in1_ = (finufft_opts*) mxWrapGetP(prhs[1], "finufft_opts:%p", &mw_err_txt_);
-    if (mw_err_txt_)
-        goto mw_err_label;
-    if (mexprofrecord_)
-        mexprofrecord_[7]++;
-    copy_finufft_opts(in0_, in1_);
+static const char *stubids7_ = "copy_finufft_opts(i mxArray, i finufft_opts*)";
+
+void mexStub7(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
+  const char *mw_err_txt_ = 0;
+  const mxArray *in0_;    /* opts       */
+  finufft_opts *in1_ = 0; /* o          */
+
+  in0_ = prhs[0];
+  in1_ = (finufft_opts *)mxWrapGetP(prhs[1], "finufft_opts:%p", &mw_err_txt_);
+  if (mw_err_txt_) goto mw_err_label;
+  if (mexprofrecord_) mexprofrecord_[7]++;
+  copy_finufft_opts(in0_, in1_);
 
 mw_err_label:
-    if (mw_err_txt_)
-        mexErrMsgTxt(mw_err_txt_);
+  if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_);
 }
 
 /* ---- finufft.mw: 187 ----
- * int ier = finufft_makeplan(int type, int dim, int64_t[3] n_modes, int iflag, int n_trans, double tol, finufft_plan* plan, finufft_opts* o);
+ * int ier = finufft_makeplan(int type, int dim, int64_t[3] n_modes, int iflag, int
+ * n_trans, double tol, finufft_plan* plan, finufft_opts* o);
  */
-static const char* stubids8_ = "o int = finufft_makeplan(i int, i int, i int64_t[x], i int, i int, i double, i finufft_plan*, i finufft_opts*)";
-
-void mexStub8(int nlhs, mxArray* plhs[],
-              int nrhs, const mxArray* prhs[])
-{
-    const char* mw_err_txt_ = 0;
-    int         in0_;    /* type       */
-    int         in1_;    /* dim        */
-    int64_t*    in2_ =0; /* n_modes    */
-    int         in3_;    /* iflag      */
-    int         in4_;    /* n_trans    */
-    double      in5_;    /* tol        */
-    finufft_plan*  in6_ =0; /* plan       */
-    finufft_opts*  in7_ =0; /* o          */
-    int         out0_;   /* ier        */
-    mwSize      dim8_;   /* 3          */
-
-    dim8_ = (mwSize) mxWrapGetScalar(prhs[8], &mw_err_txt_);
-
-    if (mxGetM(prhs[2])*mxGetN(prhs[2]) != dim8_) {
-        mw_err_txt_ = "Bad argument size: n_modes";        goto mw_err_label;
-    }
-
-    if( mxGetClassID(prhs[0]) != mxDOUBLE_CLASS )
-        mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected";
-    if (mw_err_txt_) goto mw_err_label;
-    in0_ = (int) mxWrapGetScalar(prhs[0], &mw_err_txt_);
-    if (mw_err_txt_)
-        goto mw_err_label;
-    if( mxGetClassID(prhs[1]) != mxDOUBLE_CLASS )
-        mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected";
-    if (mw_err_txt_) goto mw_err_label;
-    in1_ = (int) mxWrapGetScalar(prhs[1], &mw_err_txt_);
-    if (mw_err_txt_)
-        goto mw_err_label;
-    if (mxGetM(prhs[2])*mxGetN(prhs[2]) != 0) {
-        in2_ = mxWrapGetArray_int64_t(prhs[2], &mw_err_txt_);
-        if (mw_err_txt_)
-            goto mw_err_label;
-    } else
-        in2_ = NULL;
-    if( mxGetClassID(prhs[3]) != mxDOUBLE_CLASS )
-        mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected";
+static const char *stubids8_ = "o int = finufft_makeplan(i int, i int, i int64_t[x], i "
+                               "int, i int, i double, i finufft_plan*, i finufft_opts*)";
+
+void mexStub8(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
+  const char *mw_err_txt_ = 0;
+  int in0_;               /* type       */
+  int in1_;               /* dim        */
+  int64_t *in2_ = 0;      /* n_modes    */
+  int in3_;               /* iflag      */
+  int in4_;               /* n_trans    */
+  double in5_;            /* tol        */
+  finufft_plan *in6_ = 0; /* plan       */
+  finufft_opts *in7_ = 0; /* o          */
+  int out0_;              /* ier        */
+  mwSize dim8_;           /* 3          */
+
+  dim8_ = (mwSize)mxWrapGetScalar(prhs[8], &mw_err_txt_);
+
+  if (mxGetM(prhs[2]) * mxGetN(prhs[2]) != dim8_) {
+    mw_err_txt_ = "Bad argument size: n_modes";
+    goto mw_err_label;
+  }
+
+  if (mxGetClassID(prhs[0]) != mxDOUBLE_CLASS)
+    mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected";
+  if (mw_err_txt_) goto mw_err_label;
+  in0_ = (int)mxWrapGetScalar(prhs[0], &mw_err_txt_);
+  if (mw_err_txt_) goto mw_err_label;
+  if (mxGetClassID(prhs[1]) != mxDOUBLE_CLASS)
+    mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected";
+  if (mw_err_txt_) goto mw_err_label;
+  in1_ = (int)mxWrapGetScalar(prhs[1], &mw_err_txt_);
+  if (mw_err_txt_) goto mw_err_label;
+  if (mxGetM(prhs[2]) * mxGetN(prhs[2]) != 0) {
+    in2_ = mxWrapGetArray_int64_t(prhs[2], &mw_err_txt_);
     if (mw_err_txt_) goto mw_err_label;
-    in3_ = (int) mxWrapGetScalar(prhs[3], &mw_err_txt_);
-    if (mw_err_txt_)
-        goto mw_err_label;
-    if( mxGetClassID(prhs[4]) != mxDOUBLE_CLASS )
-        mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected";
-    if (mw_err_txt_) goto mw_err_label;
-    in4_ = (int) mxWrapGetScalar(prhs[4], &mw_err_txt_);
-    if (mw_err_txt_)
-        goto mw_err_label;
-    if( mxGetClassID(prhs[5]) != mxDOUBLE_CLASS )
-        mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected";
-    if (mw_err_txt_) goto mw_err_label;
-    in5_ = (double) mxWrapGetScalar(prhs[5], &mw_err_txt_);
-    if (mw_err_txt_)
-        goto mw_err_label;
-    in6_ = (finufft_plan*) mxWrapGetP(prhs[6], "finufft_plan:%p", &mw_err_txt_);
-    if (mw_err_txt_)
-        goto mw_err_label;
-    in7_ = (finufft_opts*) mxWrapGetP(prhs[7], "finufft_opts:%p", &mw_err_txt_);
-    if (mw_err_txt_)
-        goto mw_err_label;
-    if (mexprofrecord_)
-        mexprofrecord_[8]++;
-    out0_ = finufft_makeplan(in0_, in1_, in2_, in3_, in4_, in5_, in6_, in7_);
+  } else
+    in2_ = NULL;
+  if (mxGetClassID(prhs[3]) != mxDOUBLE_CLASS)
+    mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected";
+  if (mw_err_txt_) goto mw_err_label;
+  in3_ = (int)mxWrapGetScalar(prhs[3], &mw_err_txt_);
+  if (mw_err_txt_) goto mw_err_label;
+  if (mxGetClassID(prhs[4]) != mxDOUBLE_CLASS)
+    mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected";
+  if (mw_err_txt_) goto mw_err_label;
+  in4_ = (int)mxWrapGetScalar(prhs[4], &mw_err_txt_);
+  if (mw_err_txt_) goto mw_err_label;
+  if (mxGetClassID(prhs[5]) != mxDOUBLE_CLASS)
+    mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected";
+  if (mw_err_txt_) goto mw_err_label;
+  in5_ = (double)mxWrapGetScalar(prhs[5], &mw_err_txt_);
+  if (mw_err_txt_) goto mw_err_label;
+  in6_ = (finufft_plan *)mxWrapGetP(prhs[6], "finufft_plan:%p", &mw_err_txt_);
+  if (mw_err_txt_) goto mw_err_label;
+  in7_ = (finufft_opts *)mxWrapGetP(prhs[7], "finufft_opts:%p", &mw_err_txt_);
+  if (mw_err_txt_) goto mw_err_label;
+  if (mexprofrecord_) mexprofrecord_[8]++;
+  out0_ = finufft_makeplan(in0_, in1_, in2_, in3_, in4_, in5_, in6_, in7_);
 #if MX_HAS_INTERLEAVED_COMPLEX
-    plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL);
-    *mxGetDoubles(plhs[0]) = out0_;
+  plhs[0]                = mxCreateDoubleMatrix(1, 1, mxREAL);
+  *mxGetDoubles(plhs[0]) = out0_;
 #else
-    plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL);
-    *mxGetPr(plhs[0]) = out0_;
+  plhs[0]           = mxCreateDoubleMatrix(1, 1, mxREAL);
+  *mxGetPr(plhs[0]) = out0_;
 #endif
 
 mw_err_label:
-    if (in2_)  mxFree(in2_);
-    if (mw_err_txt_)
-        mexErrMsgTxt(mw_err_txt_);
+  if (in2_) mxFree(in2_);
+  if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_);
 }
 
 /* ---- finufft.mw: 190 ----
- * int ier = finufftf_makeplan(int type, int dim, int64_t[3] n_modes, int iflag, int n_trans, float tol, finufftf_plan* plan, finufft_opts* o);
+ * int ier = finufftf_makeplan(int type, int dim, int64_t[3] n_modes, int iflag, int
+ * n_trans, float tol, finufftf_plan* plan, finufft_opts* o);
  */
-static const char* stubids9_ = "o int = finufftf_makeplan(i int, i int, i int64_t[x], i int, i int, i float, i finufftf_plan*, i finufft_opts*)";
-
-void mexStub9(int nlhs, mxArray* plhs[],
-              int nrhs, const mxArray* prhs[])
-{
-    const char* mw_err_txt_ = 0;
-    int         in0_;    /* type       */
-    int         in1_;    /* dim        */
-    int64_t*    in2_ =0; /* n_modes    */
-    int         in3_;    /* iflag      */
-    int         in4_;    /* n_trans    */
-    float       in5_;    /* tol        */
-    finufftf_plan*  in6_ =0; /* plan       */
-    finufft_opts*  in7_ =0; /* o          */
-    int         out0_;   /* ier        */
-    mwSize      dim8_;   /* 3          */
-
-    dim8_ = (mwSize) mxWrapGetScalar(prhs[8], &mw_err_txt_);
-
-    if (mxGetM(prhs[2])*mxGetN(prhs[2]) != dim8_) {
-        mw_err_txt_ = "Bad argument size: n_modes";        goto mw_err_label;
-    }
-
-    if( mxGetClassID(prhs[0]) != mxDOUBLE_CLASS )
-        mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected";
-    if (mw_err_txt_) goto mw_err_label;
-    in0_ = (int) mxWrapGetScalar(prhs[0], &mw_err_txt_);
-    if (mw_err_txt_)
-        goto mw_err_label;
-    if( mxGetClassID(prhs[1]) != mxDOUBLE_CLASS )
-        mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected";
-    if (mw_err_txt_) goto mw_err_label;
-    in1_ = (int) mxWrapGetScalar(prhs[1], &mw_err_txt_);
-    if (mw_err_txt_)
-        goto mw_err_label;
-    if (mxGetM(prhs[2])*mxGetN(prhs[2]) != 0) {
-        in2_ = mxWrapGetArray_int64_t(prhs[2], &mw_err_txt_);
-        if (mw_err_txt_)
-            goto mw_err_label;
-    } else
-        in2_ = NULL;
-    if( mxGetClassID(prhs[3]) != mxDOUBLE_CLASS )
-        mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected";
+static const char *stubids9_ = "o int = finufftf_makeplan(i int, i int, i int64_t[x], i "
+                               "int, i int, i float, i finufftf_plan*, i finufft_opts*)";
+
+void mexStub9(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
+  const char *mw_err_txt_ = 0;
+  int in0_;                /* type       */
+  int in1_;                /* dim        */
+  int64_t *in2_ = 0;       /* n_modes    */
+  int in3_;                /* iflag      */
+  int in4_;                /* n_trans    */
+  float in5_;              /* tol        */
+  finufftf_plan *in6_ = 0; /* plan       */
+  finufft_opts *in7_  = 0; /* o          */
+  int out0_;               /* ier        */
+  mwSize dim8_;            /* 3          */
+
+  dim8_ = (mwSize)mxWrapGetScalar(prhs[8], &mw_err_txt_);
+
+  if (mxGetM(prhs[2]) * mxGetN(prhs[2]) != dim8_) {
+    mw_err_txt_ = "Bad argument size: n_modes";
+    goto mw_err_label;
+  }
+
+  if (mxGetClassID(prhs[0]) != mxDOUBLE_CLASS)
+    mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected";
+  if (mw_err_txt_) goto mw_err_label;
+  in0_ = (int)mxWrapGetScalar(prhs[0], &mw_err_txt_);
+  if (mw_err_txt_) goto mw_err_label;
+  if (mxGetClassID(prhs[1]) != mxDOUBLE_CLASS)
+    mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected";
+  if (mw_err_txt_) goto mw_err_label;
+  in1_ = (int)mxWrapGetScalar(prhs[1], &mw_err_txt_);
+  if (mw_err_txt_) goto mw_err_label;
+  if (mxGetM(prhs[2]) * mxGetN(prhs[2]) != 0) {
+    in2_ = mxWrapGetArray_int64_t(prhs[2], &mw_err_txt_);
     if (mw_err_txt_) goto mw_err_label;
-    in3_ = (int) mxWrapGetScalar(prhs[3], &mw_err_txt_);
-    if (mw_err_txt_)
-        goto mw_err_label;
-    if( mxGetClassID(prhs[4]) != mxDOUBLE_CLASS )
-        mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected";
-    if (mw_err_txt_) goto mw_err_label;
-    in4_ = (int) mxWrapGetScalar(prhs[4], &mw_err_txt_);
-    if (mw_err_txt_)
-        goto mw_err_label;
-    if( mxGetClassID(prhs[5]) != mxSINGLE_CLASS )
-        mw_err_txt_ = "Invalid scalar argument, mxSINGLE_CLASS expected";
-    if (mw_err_txt_) goto mw_err_label;
-    in5_ = (float) mxWrapGetScalar_single(prhs[5], &mw_err_txt_);
-    if (mw_err_txt_)
-        goto mw_err_label;
-    in6_ = (finufftf_plan*) mxWrapGetP(prhs[6], "finufftf_plan:%p", &mw_err_txt_);
-    if (mw_err_txt_)
-        goto mw_err_label;
-    in7_ = (finufft_opts*) mxWrapGetP(prhs[7], "finufft_opts:%p", &mw_err_txt_);
-    if (mw_err_txt_)
-        goto mw_err_label;
-    if (mexprofrecord_)
-        mexprofrecord_[9]++;
-    out0_ = finufftf_makeplan(in0_, in1_, in2_, in3_, in4_, in5_, in6_, in7_);
+  } else
+    in2_ = NULL;
+  if (mxGetClassID(prhs[3]) != mxDOUBLE_CLASS)
+    mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected";
+  if (mw_err_txt_) goto mw_err_label;
+  in3_ = (int)mxWrapGetScalar(prhs[3], &mw_err_txt_);
+  if (mw_err_txt_) goto mw_err_label;
+  if (mxGetClassID(prhs[4]) != mxDOUBLE_CLASS)
+    mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected";
+  if (mw_err_txt_) goto mw_err_label;
+  in4_ = (int)mxWrapGetScalar(prhs[4], &mw_err_txt_);
+  if (mw_err_txt_) goto mw_err_label;
+  if (mxGetClassID(prhs[5]) != mxSINGLE_CLASS)
+    mw_err_txt_ = "Invalid scalar argument, mxSINGLE_CLASS expected";
+  if (mw_err_txt_) goto mw_err_label;
+  in5_ = (float)mxWrapGetScalar_single(prhs[5], &mw_err_txt_);
+  if (mw_err_txt_) goto mw_err_label;
+  in6_ = (finufftf_plan *)mxWrapGetP(prhs[6], "finufftf_plan:%p", &mw_err_txt_);
+  if (mw_err_txt_) goto mw_err_label;
+  in7_ = (finufft_opts *)mxWrapGetP(prhs[7], "finufft_opts:%p", &mw_err_txt_);
+  if (mw_err_txt_) goto mw_err_label;
+  if (mexprofrecord_) mexprofrecord_[9]++;
+  out0_ = finufftf_makeplan(in0_, in1_, in2_, in3_, in4_, in5_, in6_, in7_);
 #if MX_HAS_INTERLEAVED_COMPLEX
-    plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL);
-    *mxGetDoubles(plhs[0]) = out0_;
+  plhs[0]                = mxCreateDoubleMatrix(1, 1, mxREAL);
+  *mxGetDoubles(plhs[0]) = out0_;
 #else
-    plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL);
-    *mxGetPr(plhs[0]) = out0_;
+  plhs[0]           = mxCreateDoubleMatrix(1, 1, mxREAL);
+  *mxGetPr(plhs[0]) = out0_;
 #endif
 
 mw_err_label:
-    if (in2_)  mxFree(in2_);
-    if (mw_err_txt_)
-        mexErrMsgTxt(mw_err_txt_);
+  if (in2_) mxFree(in2_);
+  if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_);
 }
 
 /* ---- finufft.mw: 192 ----
  * delete(finufft_opts* o);
  */
-static const char* stubids10_ = "delete(i finufft_opts*)";
+static const char *stubids10_ = "delete(i finufft_opts*)";
 
-void mexStub10(int nlhs, mxArray* plhs[],
-              int nrhs, const mxArray* prhs[])
-{
-    const char* mw_err_txt_ = 0;
-    finufft_opts*  in0_ =0; /* o          */
+void mexStub10(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
+  const char *mw_err_txt_ = 0;
+  finufft_opts *in0_      = 0; /* o          */
 
-    in0_ = (finufft_opts*) mxWrapGetP(prhs[0], "finufft_opts:%p", &mw_err_txt_);
-    if (mw_err_txt_)
-        goto mw_err_label;
-    if (mexprofrecord_)
-        mexprofrecord_[10]++;
-    delete(in0_);
+  in0_ = (finufft_opts *)mxWrapGetP(prhs[0], "finufft_opts:%p", &mw_err_txt_);
+  if (mw_err_txt_) goto mw_err_label;
+  if (mexprofrecord_) mexprofrecord_[10]++;
+  delete (in0_);
 
 mw_err_label:
-    if (mw_err_txt_)
-        mexErrMsgTxt(mw_err_txt_);
+  if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_);
 }
 
 /* ---- finufft.mw: 222 ----
- * int ier = finufft_setpts(finufft_plan plan, int64_t nj, double[] xj, double[] yj, double[] zj, int64_t nk, double[] s, double[] t, double[] u);
+ * int ier = finufft_setpts(finufft_plan plan, int64_t nj, double[] xj, double[] yj,
+ * double[] zj, int64_t nk, double[] s, double[] t, double[] u);
  */
-static const char* stubids11_ = "o int = finufft_setpts(i finufft_plan, i int64_t, i double[], i double[], i double[], i int64_t, i double[], i double[], i double[])";
-
-void mexStub11(int nlhs, mxArray* plhs[],
-              int nrhs, const mxArray* prhs[])
-{
-    const char* mw_err_txt_ = 0;
-    finufft_plan*  in0_ =0; /* plan       */
-    int64_t     in1_;    /* nj         */
-    double*     in2_ =0; /* xj         */
-    double*     in3_ =0; /* yj         */
-    double*     in4_ =0; /* zj         */
-    int64_t     in5_;    /* nk         */
-    double*     in6_ =0; /* s          */
-    double*     in7_ =0; /* t          */
-    double*     in8_ =0; /* u          */
-    int         out0_;   /* ier        */
-
-    in0_ = (finufft_plan*) mxWrapGetP(prhs[0], "finufft_plan:%p", &mw_err_txt_);
-    if (mw_err_txt_)
-        goto mw_err_label;
-    if( mxGetClassID(prhs[1]) != mxDOUBLE_CLASS )
-        mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected";
+static const char *stubids11_ =
+    "o int = finufft_setpts(i finufft_plan, i int64_t, i double[], i double[], i "
+    "double[], i int64_t, i double[], i double[], i double[])";
+
+void mexStub11(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
+  const char *mw_err_txt_ = 0;
+  finufft_plan *in0_      = 0; /* plan       */
+  int64_t in1_;                /* nj         */
+  double *in2_ = 0;            /* xj         */
+  double *in3_ = 0;            /* yj         */
+  double *in4_ = 0;            /* zj         */
+  int64_t in5_;                /* nk         */
+  double *in6_ = 0;            /* s          */
+  double *in7_ = 0;            /* t          */
+  double *in8_ = 0;            /* u          */
+  int out0_;                   /* ier        */
+
+  in0_ = (finufft_plan *)mxWrapGetP(prhs[0], "finufft_plan:%p", &mw_err_txt_);
+  if (mw_err_txt_) goto mw_err_label;
+  if (mxGetClassID(prhs[1]) != mxDOUBLE_CLASS)
+    mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected";
+  if (mw_err_txt_) goto mw_err_label;
+  in1_ = (int64_t)mxWrapGetScalar(prhs[1], &mw_err_txt_);
+  if (mw_err_txt_) goto mw_err_label;
+  if (mxGetM(prhs[2]) * mxGetN(prhs[2]) != 0) {
+    if (mxGetClassID(prhs[2]) != mxDOUBLE_CLASS)
+      mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected";
     if (mw_err_txt_) goto mw_err_label;
-    in1_ = (int64_t) mxWrapGetScalar(prhs[1], &mw_err_txt_);
-    if (mw_err_txt_)
-        goto mw_err_label;
-    if (mxGetM(prhs[2])*mxGetN(prhs[2]) != 0) {
-        if( mxGetClassID(prhs[2]) != mxDOUBLE_CLASS )
-            mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected";
-        if (mw_err_txt_) goto mw_err_label;
 #if MX_HAS_INTERLEAVED_COMPLEX
-        in2_ = mxGetDoubles(prhs[2]);
+    in2_ = mxGetDoubles(prhs[2]);
 #else
-        in2_ = mxGetPr(prhs[2]);
+    in2_ = mxGetPr(prhs[2]);
 #endif
-    } else
-        in2_ = NULL;
-    if (mxGetM(prhs[3])*mxGetN(prhs[3]) != 0) {
-        if( mxGetClassID(prhs[3]) != mxDOUBLE_CLASS )
-            mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected";
-        if (mw_err_txt_) goto mw_err_label;
+  } else
+    in2_ = NULL;
+  if (mxGetM(prhs[3]) * mxGetN(prhs[3]) != 0) {
+    if (mxGetClassID(prhs[3]) != mxDOUBLE_CLASS)
+      mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected";
+    if (mw_err_txt_) goto mw_err_label;
 #if MX_HAS_INTERLEAVED_COMPLEX
-        in3_ = mxGetDoubles(prhs[3]);
+    in3_ = mxGetDoubles(prhs[3]);
 #else
-        in3_ = mxGetPr(prhs[3]);
+    in3_ = mxGetPr(prhs[3]);
 #endif
-    } else
-        in3_ = NULL;
-    if (mxGetM(prhs[4])*mxGetN(prhs[4]) != 0) {
-        if( mxGetClassID(prhs[4]) != mxDOUBLE_CLASS )
-            mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected";
-        if (mw_err_txt_) goto mw_err_label;
+  } else
+    in3_ = NULL;
+  if (mxGetM(prhs[4]) * mxGetN(prhs[4]) != 0) {
+    if (mxGetClassID(prhs[4]) != mxDOUBLE_CLASS)
+      mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected";
+    if (mw_err_txt_) goto mw_err_label;
 #if MX_HAS_INTERLEAVED_COMPLEX
-        in4_ = mxGetDoubles(prhs[4]);
+    in4_ = mxGetDoubles(prhs[4]);
 #else
-        in4_ = mxGetPr(prhs[4]);
+    in4_ = mxGetPr(prhs[4]);
 #endif
-    } else
-        in4_ = NULL;
-    if( mxGetClassID(prhs[5]) != mxDOUBLE_CLASS )
-        mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected";
+  } else
+    in4_ = NULL;
+  if (mxGetClassID(prhs[5]) != mxDOUBLE_CLASS)
+    mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected";
+  if (mw_err_txt_) goto mw_err_label;
+  in5_ = (int64_t)mxWrapGetScalar(prhs[5], &mw_err_txt_);
+  if (mw_err_txt_) goto mw_err_label;
+  if (mxGetM(prhs[6]) * mxGetN(prhs[6]) != 0) {
+    if (mxGetClassID(prhs[6]) != mxDOUBLE_CLASS)
+      mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected";
     if (mw_err_txt_) goto mw_err_label;
-    in5_ = (int64_t) mxWrapGetScalar(prhs[5], &mw_err_txt_);
-    if (mw_err_txt_)
-        goto mw_err_label;
-    if (mxGetM(prhs[6])*mxGetN(prhs[6]) != 0) {
-        if( mxGetClassID(prhs[6]) != mxDOUBLE_CLASS )
-            mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected";
-        if (mw_err_txt_) goto mw_err_label;
 #if MX_HAS_INTERLEAVED_COMPLEX
-        in6_ = mxGetDoubles(prhs[6]);
+    in6_ = mxGetDoubles(prhs[6]);
 #else
-        in6_ = mxGetPr(prhs[6]);
+    in6_ = mxGetPr(prhs[6]);
 #endif
-    } else
-        in6_ = NULL;
-    if (mxGetM(prhs[7])*mxGetN(prhs[7]) != 0) {
-        if( mxGetClassID(prhs[7]) != mxDOUBLE_CLASS )
-            mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected";
-        if (mw_err_txt_) goto mw_err_label;
+  } else
+    in6_ = NULL;
+  if (mxGetM(prhs[7]) * mxGetN(prhs[7]) != 0) {
+    if (mxGetClassID(prhs[7]) != mxDOUBLE_CLASS)
+      mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected";
+    if (mw_err_txt_) goto mw_err_label;
 #if MX_HAS_INTERLEAVED_COMPLEX
-        in7_ = mxGetDoubles(prhs[7]);
+    in7_ = mxGetDoubles(prhs[7]);
 #else
-        in7_ = mxGetPr(prhs[7]);
+    in7_ = mxGetPr(prhs[7]);
 #endif
-    } else
-        in7_ = NULL;
-    if (mxGetM(prhs[8])*mxGetN(prhs[8]) != 0) {
-        if( mxGetClassID(prhs[8]) != mxDOUBLE_CLASS )
-            mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected";
-        if (mw_err_txt_) goto mw_err_label;
+  } else
+    in7_ = NULL;
+  if (mxGetM(prhs[8]) * mxGetN(prhs[8]) != 0) {
+    if (mxGetClassID(prhs[8]) != mxDOUBLE_CLASS)
+      mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected";
+    if (mw_err_txt_) goto mw_err_label;
 #if MX_HAS_INTERLEAVED_COMPLEX
-        in8_ = mxGetDoubles(prhs[8]);
+    in8_ = mxGetDoubles(prhs[8]);
 #else
-        in8_ = mxGetPr(prhs[8]);
+    in8_ = mxGetPr(prhs[8]);
 #endif
-    } else
-        in8_ = NULL;
-    if (!in0_) {
-        mw_err_txt_ = "Argument plan cannot be null";
-        goto mw_err_label;
-    }
-    if (mexprofrecord_)
-        mexprofrecord_[11]++;
-    out0_ = finufft_setpts(*in0_, in1_, in2_, in3_, in4_, in5_, in6_, in7_, in8_);
+  } else
+    in8_ = NULL;
+  if (!in0_) {
+    mw_err_txt_ = "Argument plan cannot be null";
+    goto mw_err_label;
+  }
+  if (mexprofrecord_) mexprofrecord_[11]++;
+  out0_ = finufft_setpts(*in0_, in1_, in2_, in3_, in4_, in5_, in6_, in7_, in8_);
 #if MX_HAS_INTERLEAVED_COMPLEX
-    plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL);
-    *mxGetDoubles(plhs[0]) = out0_;
+  plhs[0]                = mxCreateDoubleMatrix(1, 1, mxREAL);
+  *mxGetDoubles(plhs[0]) = out0_;
 #else
-    plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL);
-    *mxGetPr(plhs[0]) = out0_;
+  plhs[0]           = mxCreateDoubleMatrix(1, 1, mxREAL);
+  *mxGetPr(plhs[0]) = out0_;
 #endif
 
 mw_err_label:
-    if (mw_err_txt_)
-        mexErrMsgTxt(mw_err_txt_);
+  if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_);
 }
 
 /* ---- finufft.mw: 224 ----
- * int ier = finufftf_setpts(finufftf_plan plan, int64_t nj, float[] xj, float[] yj, float[] zj, int64_t nk, float[] s, float[] t, float[] u);
+ * int ier = finufftf_setpts(finufftf_plan plan, int64_t nj, float[] xj, float[] yj,
+ * float[] zj, int64_t nk, float[] s, float[] t, float[] u);
  */
-static const char* stubids12_ = "o int = finufftf_setpts(i finufftf_plan, i int64_t, i float[], i float[], i float[], i int64_t, i float[], i float[], i float[])";
-
-void mexStub12(int nlhs, mxArray* plhs[],
-              int nrhs, const mxArray* prhs[])
-{
-    const char* mw_err_txt_ = 0;
-    finufftf_plan*  in0_ =0; /* plan       */
-    int64_t     in1_;    /* nj         */
-    float*      in2_ =0; /* xj         */
-    float*      in3_ =0; /* yj         */
-    float*      in4_ =0; /* zj         */
-    int64_t     in5_;    /* nk         */
-    float*      in6_ =0; /* s          */
-    float*      in7_ =0; /* t          */
-    float*      in8_ =0; /* u          */
-    int         out0_;   /* ier        */
-
-    in0_ = (finufftf_plan*) mxWrapGetP(prhs[0], "finufftf_plan:%p", &mw_err_txt_);
-    if (mw_err_txt_)
-        goto mw_err_label;
-    if( mxGetClassID(prhs[1]) != mxDOUBLE_CLASS )
-        mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected";
+static const char *stubids12_ =
+    "o int = finufftf_setpts(i finufftf_plan, i int64_t, i float[], i float[], i "
+    "float[], i int64_t, i float[], i float[], i float[])";
+
+void mexStub12(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
+  const char *mw_err_txt_ = 0;
+  finufftf_plan *in0_     = 0; /* plan       */
+  int64_t in1_;                /* nj         */
+  float *in2_ = 0;             /* xj         */
+  float *in3_ = 0;             /* yj         */
+  float *in4_ = 0;             /* zj         */
+  int64_t in5_;                /* nk         */
+  float *in6_ = 0;             /* s          */
+  float *in7_ = 0;             /* t          */
+  float *in8_ = 0;             /* u          */
+  int out0_;                   /* ier        */
+
+  in0_ = (finufftf_plan *)mxWrapGetP(prhs[0], "finufftf_plan:%p", &mw_err_txt_);
+  if (mw_err_txt_) goto mw_err_label;
+  if (mxGetClassID(prhs[1]) != mxDOUBLE_CLASS)
+    mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected";
+  if (mw_err_txt_) goto mw_err_label;
+  in1_ = (int64_t)mxWrapGetScalar(prhs[1], &mw_err_txt_);
+  if (mw_err_txt_) goto mw_err_label;
+  if (mxGetM(prhs[2]) * mxGetN(prhs[2]) != 0) {
+    if (mxGetClassID(prhs[2]) != mxSINGLE_CLASS)
+      mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected";
     if (mw_err_txt_) goto mw_err_label;
-    in1_ = (int64_t) mxWrapGetScalar(prhs[1], &mw_err_txt_);
-    if (mw_err_txt_)
-        goto mw_err_label;
-    if (mxGetM(prhs[2])*mxGetN(prhs[2]) != 0) {
-        if( mxGetClassID(prhs[2]) != mxSINGLE_CLASS )
-            mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected";
-        if (mw_err_txt_) goto mw_err_label;
 #if MX_HAS_INTERLEAVED_COMPLEX
-        in2_ = mxGetSingles(prhs[2]);
+    in2_ = mxGetSingles(prhs[2]);
 #else
-        in2_ = (float*) mxGetData(prhs[2]);
+    in2_ = (float *)mxGetData(prhs[2]);
 #endif
-    } else
-        in2_ = NULL;
-    if (mxGetM(prhs[3])*mxGetN(prhs[3]) != 0) {
-        if( mxGetClassID(prhs[3]) != mxSINGLE_CLASS )
-            mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected";
-        if (mw_err_txt_) goto mw_err_label;
+  } else
+    in2_ = NULL;
+  if (mxGetM(prhs[3]) * mxGetN(prhs[3]) != 0) {
+    if (mxGetClassID(prhs[3]) != mxSINGLE_CLASS)
+      mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected";
+    if (mw_err_txt_) goto mw_err_label;
 #if MX_HAS_INTERLEAVED_COMPLEX
-        in3_ = mxGetSingles(prhs[3]);
+    in3_ = mxGetSingles(prhs[3]);
 #else
-        in3_ = (float*) mxGetData(prhs[3]);
+    in3_ = (float *)mxGetData(prhs[3]);
 #endif
-    } else
-        in3_ = NULL;
-    if (mxGetM(prhs[4])*mxGetN(prhs[4]) != 0) {
-        if( mxGetClassID(prhs[4]) != mxSINGLE_CLASS )
-            mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected";
-        if (mw_err_txt_) goto mw_err_label;
+  } else
+    in3_ = NULL;
+  if (mxGetM(prhs[4]) * mxGetN(prhs[4]) != 0) {
+    if (mxGetClassID(prhs[4]) != mxSINGLE_CLASS)
+      mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected";
+    if (mw_err_txt_) goto mw_err_label;
 #if MX_HAS_INTERLEAVED_COMPLEX
-        in4_ = mxGetSingles(prhs[4]);
+    in4_ = mxGetSingles(prhs[4]);
 #else
-        in4_ = (float*) mxGetData(prhs[4]);
+    in4_ = (float *)mxGetData(prhs[4]);
 #endif
-    } else
-        in4_ = NULL;
-    if( mxGetClassID(prhs[5]) != mxDOUBLE_CLASS )
-        mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected";
+  } else
+    in4_ = NULL;
+  if (mxGetClassID(prhs[5]) != mxDOUBLE_CLASS)
+    mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected";
+  if (mw_err_txt_) goto mw_err_label;
+  in5_ = (int64_t)mxWrapGetScalar(prhs[5], &mw_err_txt_);
+  if (mw_err_txt_) goto mw_err_label;
+  if (mxGetM(prhs[6]) * mxGetN(prhs[6]) != 0) {
+    if (mxGetClassID(prhs[6]) != mxSINGLE_CLASS)
+      mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected";
     if (mw_err_txt_) goto mw_err_label;
-    in5_ = (int64_t) mxWrapGetScalar(prhs[5], &mw_err_txt_);
-    if (mw_err_txt_)
-        goto mw_err_label;
-    if (mxGetM(prhs[6])*mxGetN(prhs[6]) != 0) {
-        if( mxGetClassID(prhs[6]) != mxSINGLE_CLASS )
-            mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected";
-        if (mw_err_txt_) goto mw_err_label;
 #if MX_HAS_INTERLEAVED_COMPLEX
-        in6_ = mxGetSingles(prhs[6]);
+    in6_ = mxGetSingles(prhs[6]);
 #else
-        in6_ = (float*) mxGetData(prhs[6]);
+    in6_ = (float *)mxGetData(prhs[6]);
 #endif
-    } else
-        in6_ = NULL;
-    if (mxGetM(prhs[7])*mxGetN(prhs[7]) != 0) {
-        if( mxGetClassID(prhs[7]) != mxSINGLE_CLASS )
-            mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected";
-        if (mw_err_txt_) goto mw_err_label;
+  } else
+    in6_ = NULL;
+  if (mxGetM(prhs[7]) * mxGetN(prhs[7]) != 0) {
+    if (mxGetClassID(prhs[7]) != mxSINGLE_CLASS)
+      mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected";
+    if (mw_err_txt_) goto mw_err_label;
 #if MX_HAS_INTERLEAVED_COMPLEX
-        in7_ = mxGetSingles(prhs[7]);
+    in7_ = mxGetSingles(prhs[7]);
 #else
-        in7_ = (float*) mxGetData(prhs[7]);
+    in7_ = (float *)mxGetData(prhs[7]);
 #endif
-    } else
-        in7_ = NULL;
-    if (mxGetM(prhs[8])*mxGetN(prhs[8]) != 0) {
-        if( mxGetClassID(prhs[8]) != mxSINGLE_CLASS )
-            mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected";
-        if (mw_err_txt_) goto mw_err_label;
+  } else
+    in7_ = NULL;
+  if (mxGetM(prhs[8]) * mxGetN(prhs[8]) != 0) {
+    if (mxGetClassID(prhs[8]) != mxSINGLE_CLASS)
+      mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected";
+    if (mw_err_txt_) goto mw_err_label;
 #if MX_HAS_INTERLEAVED_COMPLEX
-        in8_ = mxGetSingles(prhs[8]);
+    in8_ = mxGetSingles(prhs[8]);
 #else
-        in8_ = (float*) mxGetData(prhs[8]);
+    in8_ = (float *)mxGetData(prhs[8]);
 #endif
-    } else
-        in8_ = NULL;
-    if (!in0_) {
-        mw_err_txt_ = "Argument plan cannot be null";
-        goto mw_err_label;
-    }
-    if (mexprofrecord_)
-        mexprofrecord_[12]++;
-    out0_ = finufftf_setpts(*in0_, in1_, in2_, in3_, in4_, in5_, in6_, in7_, in8_);
+  } else
+    in8_ = NULL;
+  if (!in0_) {
+    mw_err_txt_ = "Argument plan cannot be null";
+    goto mw_err_label;
+  }
+  if (mexprofrecord_) mexprofrecord_[12]++;
+  out0_ = finufftf_setpts(*in0_, in1_, in2_, in3_, in4_, in5_, in6_, in7_, in8_);
 #if MX_HAS_INTERLEAVED_COMPLEX
-    plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL);
-    *mxGetDoubles(plhs[0]) = out0_;
+  plhs[0]                = mxCreateDoubleMatrix(1, 1, mxREAL);
+  *mxGetDoubles(plhs[0]) = out0_;
 #else
-    plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL);
-    *mxGetPr(plhs[0]) = out0_;
+  plhs[0]           = mxCreateDoubleMatrix(1, 1, mxREAL);
+  *mxGetPr(plhs[0]) = out0_;
 #endif
 
 mw_err_label:
-    if (mw_err_txt_)
-        mexErrMsgTxt(mw_err_txt_);
+  if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_);
 }
 
 /* ---- finufft.mw: 251 ----
- * int ier = finufft_execute(finufft_plan plan, dcomplex[] data_in, output dcomplex[ncoeffs] result);
+ * int ier = finufft_execute(finufft_plan plan, dcomplex[] data_in, output
+ * dcomplex[ncoeffs] result);
  */
-static const char* stubids13_ = "o int = finufft_execute(i finufft_plan, i dcomplex[], o dcomplex[x])";
-
-void mexStub13(int nlhs, mxArray* plhs[],
-              int nrhs, const mxArray* prhs[])
-{
-    const char* mw_err_txt_ = 0;
-    finufft_plan*  in0_ =0; /* plan       */
-    dcomplex*   in1_ =0; /* data_in    */
-    int         out0_;   /* ier        */
-    dcomplex*   out1_=0; /* result     */
-    mwSize      dim2_;   /* ncoeffs    */
-
-    dim2_ = (mwSize) mxWrapGetScalar(prhs[2], &mw_err_txt_);
-
-    in0_ = (finufft_plan*) mxWrapGetP(prhs[0], "finufft_plan:%p", &mw_err_txt_);
-    if (mw_err_txt_)
-        goto mw_err_label;
-    if (mxGetM(prhs[1])*mxGetN(prhs[1]) != 0) {
-        if( mxGetClassID(prhs[1]) != mxDOUBLE_CLASS )
-            mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected";
-        if (mw_err_txt_) goto mw_err_label;
-        in1_ = mxWrapGetArray_dcomplex(prhs[1], &mw_err_txt_);
-        if (mw_err_txt_)
-            goto mw_err_label;
-    } else
-        in1_ = NULL;
-    if (!in0_) {
-        mw_err_txt_ = "Argument plan cannot be null";
-        goto mw_err_label;
-    }
-    out1_ = (dcomplex*) mxMalloc(dim2_*sizeof(dcomplex));
-    if (mexprofrecord_)
-        mexprofrecord_[13]++;
-    out0_ = finufft_execute(*in0_, in1_, out1_);
+static const char *stubids13_ =
+    "o int = finufft_execute(i finufft_plan, i dcomplex[], o dcomplex[x])";
+
+void mexStub13(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
+  const char *mw_err_txt_ = 0;
+  finufft_plan *in0_      = 0; /* plan       */
+  dcomplex *in1_          = 0; /* data_in    */
+  int out0_;                   /* ier        */
+  dcomplex *out1_ = 0;         /* result     */
+  mwSize dim2_;                /* ncoeffs    */
+
+  dim2_ = (mwSize)mxWrapGetScalar(prhs[2], &mw_err_txt_);
+
+  in0_ = (finufft_plan *)mxWrapGetP(prhs[0], "finufft_plan:%p", &mw_err_txt_);
+  if (mw_err_txt_) goto mw_err_label;
+  if (mxGetM(prhs[1]) * mxGetN(prhs[1]) != 0) {
+    if (mxGetClassID(prhs[1]) != mxDOUBLE_CLASS)
+      mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected";
+    if (mw_err_txt_) goto mw_err_label;
+    in1_ = mxWrapGetArray_dcomplex(prhs[1], &mw_err_txt_);
+    if (mw_err_txt_) goto mw_err_label;
+  } else
+    in1_ = NULL;
+  if (!in0_) {
+    mw_err_txt_ = "Argument plan cannot be null";
+    goto mw_err_label;
+  }
+  out1_ = (dcomplex *)mxMalloc(dim2_ * sizeof(dcomplex));
+  if (mexprofrecord_) mexprofrecord_[13]++;
+  out0_ = finufft_execute(*in0_, in1_, out1_);
 #if MX_HAS_INTERLEAVED_COMPLEX
-    plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL);
-    *mxGetDoubles(plhs[0]) = out0_;
+  plhs[0]                = mxCreateDoubleMatrix(1, 1, mxREAL);
+  *mxGetDoubles(plhs[0]) = out0_;
 #else
-    plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL);
-    *mxGetPr(plhs[0]) = out0_;
+  plhs[0]           = mxCreateDoubleMatrix(1, 1, mxREAL);
+  *mxGetPr(plhs[0]) = out0_;
 #endif
-    plhs[1] = mxCreateDoubleMatrix(dim2_, 1, mxCOMPLEX);
-    mxWrapCopy_dcomplex(plhs[1], out1_, dim2_);
+  plhs[1] = mxCreateDoubleMatrix(dim2_, 1, mxCOMPLEX);
+  mxWrapCopy_dcomplex(plhs[1], out1_, dim2_);
 
 mw_err_label:
-    if (in1_)  mxFree(in1_);
-    if (out1_) mxFree(out1_);
-    if (mw_err_txt_)
-        mexErrMsgTxt(mw_err_txt_);
+  if (in1_) mxFree(in1_);
+  if (out1_) mxFree(out1_);
+  if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_);
 }
 
 /* ---- finufft.mw: 253 ----
- * int ier = finufftf_execute(finufftf_plan plan, fcomplex[] data_in, output fcomplex[ncoeffs] result);
+ * int ier = finufftf_execute(finufftf_plan plan, fcomplex[] data_in, output
+ * fcomplex[ncoeffs] result);
  */
-static const char* stubids14_ = "o int = finufftf_execute(i finufftf_plan, i fcomplex[], o fcomplex[x])";
-
-void mexStub14(int nlhs, mxArray* plhs[],
-              int nrhs, const mxArray* prhs[])
-{
-    const char* mw_err_txt_ = 0;
-    finufftf_plan*  in0_ =0; /* plan       */
-    fcomplex*   in1_ =0; /* data_in    */
-    int         out0_;   /* ier        */
-    fcomplex*   out1_=0; /* result     */
-    mwSize      dim2_;   /* ncoeffs    */
-
-    dim2_ = (mwSize) mxWrapGetScalar(prhs[2], &mw_err_txt_);
-
-    in0_ = (finufftf_plan*) mxWrapGetP(prhs[0], "finufftf_plan:%p", &mw_err_txt_);
-    if (mw_err_txt_)
-        goto mw_err_label;
-    if (mxGetM(prhs[1])*mxGetN(prhs[1]) != 0) {
-        if( mxGetClassID(prhs[1]) != mxSINGLE_CLASS )
-            mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected";
-        if (mw_err_txt_) goto mw_err_label;
-        in1_ = mxWrapGetArray_single_fcomplex(prhs[1], &mw_err_txt_);
-        if (mw_err_txt_)
-            goto mw_err_label;
-    } else
-        in1_ = NULL;
-    if (!in0_) {
-        mw_err_txt_ = "Argument plan cannot be null";
-        goto mw_err_label;
-    }
-    out1_ = (fcomplex*) mxMalloc(dim2_*sizeof(fcomplex));
-    if (mexprofrecord_)
-        mexprofrecord_[14]++;
-    out0_ = finufftf_execute(*in0_, in1_, out1_);
+static const char *stubids14_ =
+    "o int = finufftf_execute(i finufftf_plan, i fcomplex[], o fcomplex[x])";
+
+void mexStub14(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
+  const char *mw_err_txt_ = 0;
+  finufftf_plan *in0_     = 0; /* plan       */
+  fcomplex *in1_          = 0; /* data_in    */
+  int out0_;                   /* ier        */
+  fcomplex *out1_ = 0;         /* result     */
+  mwSize dim2_;                /* ncoeffs    */
+
+  dim2_ = (mwSize)mxWrapGetScalar(prhs[2], &mw_err_txt_);
+
+  in0_ = (finufftf_plan *)mxWrapGetP(prhs[0], "finufftf_plan:%p", &mw_err_txt_);
+  if (mw_err_txt_) goto mw_err_label;
+  if (mxGetM(prhs[1]) * mxGetN(prhs[1]) != 0) {
+    if (mxGetClassID(prhs[1]) != mxSINGLE_CLASS)
+      mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected";
+    if (mw_err_txt_) goto mw_err_label;
+    in1_ = mxWrapGetArray_single_fcomplex(prhs[1], &mw_err_txt_);
+    if (mw_err_txt_) goto mw_err_label;
+  } else
+    in1_ = NULL;
+  if (!in0_) {
+    mw_err_txt_ = "Argument plan cannot be null";
+    goto mw_err_label;
+  }
+  out1_ = (fcomplex *)mxMalloc(dim2_ * sizeof(fcomplex));
+  if (mexprofrecord_) mexprofrecord_[14]++;
+  out0_ = finufftf_execute(*in0_, in1_, out1_);
 #if MX_HAS_INTERLEAVED_COMPLEX
-    plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL);
-    *mxGetDoubles(plhs[0]) = out0_;
+  plhs[0]                = mxCreateDoubleMatrix(1, 1, mxREAL);
+  *mxGetDoubles(plhs[0]) = out0_;
 #else
-    plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL);
-    *mxGetPr(plhs[0]) = out0_;
+  plhs[0]           = mxCreateDoubleMatrix(1, 1, mxREAL);
+  *mxGetPr(plhs[0]) = out0_;
 #endif
-    plhs[1] = mxCreateNumericMatrix(dim2_, 1, mxSINGLE_CLASS, mxCOMPLEX);
-    mxWrapCopy_single_fcomplex(plhs[1], out1_, dim2_);
+  plhs[1] = mxCreateNumericMatrix(dim2_, 1, mxSINGLE_CLASS, mxCOMPLEX);
+  mxWrapCopy_single_fcomplex(plhs[1], out1_, dim2_);
 
 mw_err_label:
-    if (in1_)  mxFree(in1_);
-    if (out1_) mxFree(out1_);
-    if (mw_err_txt_)
-        mexErrMsgTxt(mw_err_txt_);
+  if (in1_) mxFree(in1_);
+  if (out1_) mxFree(out1_);
+  if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_);
 }
 
 /* ---- finufft.mw: 259 ----
- * int ier = finufft_execute(finufft_plan plan, output dcomplex[nj, n_trans] result, dcomplex[] data_in);
+ * int ier = finufft_execute(finufft_plan plan, output dcomplex[nj, n_trans] result,
+ * dcomplex[] data_in);
  */
-static const char* stubids15_ = "o int = finufft_execute(i finufft_plan, o dcomplex[xx], i dcomplex[])";
-
-void mexStub15(int nlhs, mxArray* plhs[],
-              int nrhs, const mxArray* prhs[])
-{
-    const char* mw_err_txt_ = 0;
-    finufft_plan*  in0_ =0; /* plan       */
-    dcomplex*   in1_ =0; /* data_in    */
-    int         out0_;   /* ier        */
-    dcomplex*   out1_=0; /* result     */
-    mwSize      dim2_;   /* nj         */
-    mwSize      dim3_;   /* n_trans    */
-
-    dim2_ = (mwSize) mxWrapGetScalar(prhs[2], &mw_err_txt_);
-    dim3_ = (mwSize) mxWrapGetScalar(prhs[3], &mw_err_txt_);
-
-    in0_ = (finufft_plan*) mxWrapGetP(prhs[0], "finufft_plan:%p", &mw_err_txt_);
-    if (mw_err_txt_)
-        goto mw_err_label;
-    if (mxGetM(prhs[1])*mxGetN(prhs[1]) != 0) {
-        if( mxGetClassID(prhs[1]) != mxDOUBLE_CLASS )
-            mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected";
-        if (mw_err_txt_) goto mw_err_label;
-        in1_ = mxWrapGetArray_dcomplex(prhs[1], &mw_err_txt_);
-        if (mw_err_txt_)
-            goto mw_err_label;
-    } else
-        in1_ = NULL;
-    if (!in0_) {
-        mw_err_txt_ = "Argument plan cannot be null";
-        goto mw_err_label;
-    }
-    out1_ = (dcomplex*) mxMalloc(dim2_*dim3_*sizeof(dcomplex));
-    if (mexprofrecord_)
-        mexprofrecord_[15]++;
-    out0_ = finufft_execute(*in0_, out1_, in1_);
+static const char *stubids15_ = /* dispatch key that mexFunction compares against */
+    "o int = finufft_execute(i finufft_plan, o dcomplex[xx], i dcomplex[])";
+
+void mexStub15(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
+  const char *mw_err_txt_ = 0; /* error text; non-null sends control to mw_err_label */
+  finufft_plan *in0_      = 0; /* plan: unwrapped from prhs[0] */
+  dcomplex *in1_          = 0; /* data_in: taken from prhs[1], NULL when it is empty */
+  int out0_;                   /* ier: return value of finufft_execute */
+  dcomplex *out1_ = 0;         /* result: mxMalloc'd scratch, copied into plhs[1] */
+  mwSize dim2_;                /* nj: first dimension of result, read from prhs[2] */
+  mwSize dim3_;                /* n_trans: second dimension of result, from prhs[3] */
+
+  dim2_ = (mwSize)mxWrapGetScalar(prhs[2], &mw_err_txt_);
+  dim3_ = (mwSize)mxWrapGetScalar(prhs[3], &mw_err_txt_);
+
+  in0_ = (finufft_plan *)mxWrapGetP(prhs[0], "finufft_plan:%p", &mw_err_txt_);
+  if (mw_err_txt_) goto mw_err_label;
+  if (mxGetM(prhs[1]) * mxGetN(prhs[1]) != 0) {
+    if (mxGetClassID(prhs[1]) != mxDOUBLE_CLASS)
+      mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected";
+    if (mw_err_txt_) goto mw_err_label;
+    in1_ = mxWrapGetArray_dcomplex(prhs[1], &mw_err_txt_);
+    if (mw_err_txt_) goto mw_err_label;
+  } else
+    in1_ = NULL; /* empty prhs[1]: the library receives a null input pointer */
+  if (!in0_) {
+    mw_err_txt_ = "Argument plan cannot be null";
+    goto mw_err_label;
+  }
+  out1_ = (dcomplex *)mxMalloc(dim2_ * dim3_ * sizeof(dcomplex));
+  if (mexprofrecord_) mexprofrecord_[15]++; /* call counter, only while profiling */
+  out0_ = finufft_execute(*in0_, out1_, in1_);
 #if MX_HAS_INTERLEAVED_COMPLEX
-    plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL);
-    *mxGetDoubles(plhs[0]) = out0_;
+  plhs[0]                = mxCreateDoubleMatrix(1, 1, mxREAL);
+  *mxGetDoubles(plhs[0]) = out0_;
 #else
-    plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL);
-    *mxGetPr(plhs[0]) = out0_;
+  plhs[0]           = mxCreateDoubleMatrix(1, 1, mxREAL);
+  *mxGetPr(plhs[0]) = out0_;
 #endif
-    plhs[1] = mxCreateDoubleMatrix(dim2_, dim3_, mxCOMPLEX);
-    mxWrapCopy_dcomplex(plhs[1], out1_, dim2_*dim3_);
+  plhs[1] = mxCreateDoubleMatrix(dim2_, dim3_, mxCOMPLEX);
+  mxWrapCopy_dcomplex(plhs[1], out1_, dim2_ * dim3_);
 
 mw_err_label:
-    if (out1_) mxFree(out1_);
-    if (in1_)  mxFree(in1_);
-    if (mw_err_txt_)
-        mexErrMsgTxt(mw_err_txt_);
+  if (out1_) mxFree(out1_);
+  if (in1_) mxFree(in1_);
+  if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); /* raised only after both frees */
 }
 
 /* ---- finufft.mw: 261 ----
- * int ier = finufftf_execute(finufftf_plan plan, output fcomplex[nj, n_trans] result, fcomplex[] data_in);
+ * int ier = finufftf_execute(finufftf_plan plan, output fcomplex[nj, n_trans] result,
+ * fcomplex[] data_in);
  */
-static const char* stubids16_ = "o int = finufftf_execute(i finufftf_plan, o fcomplex[xx], i fcomplex[])";
-
-void mexStub16(int nlhs, mxArray* plhs[],
-              int nrhs, const mxArray* prhs[])
-{
-    const char* mw_err_txt_ = 0;
-    finufftf_plan*  in0_ =0; /* plan       */
-    fcomplex*   in1_ =0; /* data_in    */
-    int         out0_;   /* ier        */
-    fcomplex*   out1_=0; /* result     */
-    mwSize      dim2_;   /* nj         */
-    mwSize      dim3_;   /* n_trans    */
-
-    dim2_ = (mwSize) mxWrapGetScalar(prhs[2], &mw_err_txt_);
-    dim3_ = (mwSize) mxWrapGetScalar(prhs[3], &mw_err_txt_);
-
-    in0_ = (finufftf_plan*) mxWrapGetP(prhs[0], "finufftf_plan:%p", &mw_err_txt_);
-    if (mw_err_txt_)
-        goto mw_err_label;
-    if (mxGetM(prhs[1])*mxGetN(prhs[1]) != 0) {
-        if( mxGetClassID(prhs[1]) != mxSINGLE_CLASS )
-            mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected";
-        if (mw_err_txt_) goto mw_err_label;
-        in1_ = mxWrapGetArray_single_fcomplex(prhs[1], &mw_err_txt_);
-        if (mw_err_txt_)
-            goto mw_err_label;
-    } else
-        in1_ = NULL;
-    if (!in0_) {
-        mw_err_txt_ = "Argument plan cannot be null";
-        goto mw_err_label;
-    }
-    out1_ = (fcomplex*) mxMalloc(dim2_*dim3_*sizeof(fcomplex));
-    if (mexprofrecord_)
-        mexprofrecord_[16]++;
-    out0_ = finufftf_execute(*in0_, out1_, in1_);
+static const char *stubids16_ = /* dispatch key that mexFunction compares against */
+    "o int = finufftf_execute(i finufftf_plan, o fcomplex[xx], i fcomplex[])";
+
+void mexStub16(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
+  const char *mw_err_txt_ = 0; /* error text; non-null sends control to mw_err_label */
+  finufftf_plan *in0_     = 0; /* plan: unwrapped from prhs[0] */
+  fcomplex *in1_          = 0; /* data_in: taken from prhs[1], NULL when it is empty */
+  int out0_;                   /* ier: return value of finufftf_execute */
+  fcomplex *out1_ = 0;         /* result: mxMalloc'd scratch, copied into plhs[1] */
+  mwSize dim2_;                /* nj: first dimension of result, read from prhs[2] */
+  mwSize dim3_;                /* n_trans: second dimension of result, from prhs[3] */
+
+  dim2_ = (mwSize)mxWrapGetScalar(prhs[2], &mw_err_txt_);
+  dim3_ = (mwSize)mxWrapGetScalar(prhs[3], &mw_err_txt_);
+
+  in0_ = (finufftf_plan *)mxWrapGetP(prhs[0], "finufftf_plan:%p", &mw_err_txt_);
+  if (mw_err_txt_) goto mw_err_label;
+  if (mxGetM(prhs[1]) * mxGetN(prhs[1]) != 0) {
+    if (mxGetClassID(prhs[1]) != mxSINGLE_CLASS)
+      mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected";
+    if (mw_err_txt_) goto mw_err_label;
+    in1_ = mxWrapGetArray_single_fcomplex(prhs[1], &mw_err_txt_);
+    if (mw_err_txt_) goto mw_err_label;
+  } else
+    in1_ = NULL; /* empty prhs[1]: the library receives a null input pointer */
+  if (!in0_) {
+    mw_err_txt_ = "Argument plan cannot be null";
+    goto mw_err_label;
+  }
+  out1_ = (fcomplex *)mxMalloc(dim2_ * dim3_ * sizeof(fcomplex));
+  if (mexprofrecord_) mexprofrecord_[16]++; /* call counter, only while profiling */
+  out0_ = finufftf_execute(*in0_, out1_, in1_);
 #if MX_HAS_INTERLEAVED_COMPLEX
-    plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL);
-    *mxGetDoubles(plhs[0]) = out0_;
+  plhs[0]                = mxCreateDoubleMatrix(1, 1, mxREAL);
+  *mxGetDoubles(plhs[0]) = out0_;
 #else
-    plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL);
-    *mxGetPr(plhs[0]) = out0_;
+  plhs[0]           = mxCreateDoubleMatrix(1, 1, mxREAL);
+  *mxGetPr(plhs[0]) = out0_;
 #endif
-    plhs[1] = mxCreateNumericMatrix(dim2_, dim3_, mxSINGLE_CLASS, mxCOMPLEX);
-    mxWrapCopy_single_fcomplex(plhs[1], out1_, dim2_*dim3_);
+  plhs[1] = mxCreateNumericMatrix(dim2_, dim3_, mxSINGLE_CLASS, mxCOMPLEX);
+  mxWrapCopy_single_fcomplex(plhs[1], out1_, dim2_ * dim3_);
 
 mw_err_label:
-    if (out1_) mxFree(out1_);
-    if (in1_)  mxFree(in1_);
-    if (mw_err_txt_)
-        mexErrMsgTxt(mw_err_txt_);
+  if (out1_) mxFree(out1_);
+  if (in1_) mxFree(in1_);
+  if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); /* raised only after both frees */
 }
 
 /* ---- finufft.mw: 265 ----
- * int ier = finufft_execute(finufft_plan plan, dcomplex[] data_in, output dcomplex[nk, n_trans] result);
+ * int ier = finufft_execute(finufft_plan plan, dcomplex[] data_in, output dcomplex[nk,
+ * n_trans] result);
  */
-static const char* stubids17_ = "o int = finufft_execute(i finufft_plan, i dcomplex[], o dcomplex[xx])";
-
-void mexStub17(int nlhs, mxArray* plhs[],
-              int nrhs, const mxArray* prhs[])
-{
-    const char* mw_err_txt_ = 0;
-    finufft_plan*  in0_ =0; /* plan       */
-    dcomplex*   in1_ =0; /* data_in    */
-    int         out0_;   /* ier        */
-    dcomplex*   out1_=0; /* result     */
-    mwSize      dim2_;   /* nk         */
-    mwSize      dim3_;   /* n_trans    */
-
-    dim2_ = (mwSize) mxWrapGetScalar(prhs[2], &mw_err_txt_);
-    dim3_ = (mwSize) mxWrapGetScalar(prhs[3], &mw_err_txt_);
-
-    in0_ = (finufft_plan*) mxWrapGetP(prhs[0], "finufft_plan:%p", &mw_err_txt_);
-    if (mw_err_txt_)
-        goto mw_err_label;
-    if (mxGetM(prhs[1])*mxGetN(prhs[1]) != 0) {
-        if( mxGetClassID(prhs[1]) != mxDOUBLE_CLASS )
-            mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected";
-        if (mw_err_txt_) goto mw_err_label;
-        in1_ = mxWrapGetArray_dcomplex(prhs[1], &mw_err_txt_);
-        if (mw_err_txt_)
-            goto mw_err_label;
-    } else
-        in1_ = NULL;
-    if (!in0_) {
-        mw_err_txt_ = "Argument plan cannot be null";
-        goto mw_err_label;
-    }
-    out1_ = (dcomplex*) mxMalloc(dim2_*dim3_*sizeof(dcomplex));
-    if (mexprofrecord_)
-        mexprofrecord_[17]++;
-    out0_ = finufft_execute(*in0_, in1_, out1_);
+static const char *stubids17_ = /* dispatch key that mexFunction compares against */
+    "o int = finufft_execute(i finufft_plan, i dcomplex[], o dcomplex[xx])";
+
+void mexStub17(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
+  const char *mw_err_txt_ = 0; /* error text; non-null sends control to mw_err_label */
+  finufft_plan *in0_      = 0; /* plan: unwrapped from prhs[0] */
+  dcomplex *in1_          = 0; /* data_in: taken from prhs[1], NULL when it is empty */
+  int out0_;                   /* ier: return value of finufft_execute */
+  dcomplex *out1_ = 0;         /* result: mxMalloc'd scratch, copied into plhs[1] */
+  mwSize dim2_;                /* nk: first dimension of result, read from prhs[2] */
+  mwSize dim3_;                /* n_trans: second dimension of result, from prhs[3] */
+
+  dim2_ = (mwSize)mxWrapGetScalar(prhs[2], &mw_err_txt_);
+  dim3_ = (mwSize)mxWrapGetScalar(prhs[3], &mw_err_txt_);
+
+  in0_ = (finufft_plan *)mxWrapGetP(prhs[0], "finufft_plan:%p", &mw_err_txt_);
+  if (mw_err_txt_) goto mw_err_label;
+  if (mxGetM(prhs[1]) * mxGetN(prhs[1]) != 0) {
+    if (mxGetClassID(prhs[1]) != mxDOUBLE_CLASS)
+      mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected";
+    if (mw_err_txt_) goto mw_err_label;
+    in1_ = mxWrapGetArray_dcomplex(prhs[1], &mw_err_txt_);
+    if (mw_err_txt_) goto mw_err_label;
+  } else
+    in1_ = NULL; /* empty prhs[1]: the library receives a null input pointer */
+  if (!in0_) {
+    mw_err_txt_ = "Argument plan cannot be null";
+    goto mw_err_label;
+  }
+  out1_ = (dcomplex *)mxMalloc(dim2_ * dim3_ * sizeof(dcomplex));
+  if (mexprofrecord_) mexprofrecord_[17]++; /* call counter, only while profiling */
+  out0_ = finufft_execute(*in0_, in1_, out1_); /* input first, output second here */
 #if MX_HAS_INTERLEAVED_COMPLEX
-    plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL);
-    *mxGetDoubles(plhs[0]) = out0_;
+  plhs[0]                = mxCreateDoubleMatrix(1, 1, mxREAL);
+  *mxGetDoubles(plhs[0]) = out0_;
 #else
-    plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL);
-    *mxGetPr(plhs[0]) = out0_;
+  plhs[0]           = mxCreateDoubleMatrix(1, 1, mxREAL);
+  *mxGetPr(plhs[0]) = out0_;
 #endif
-    plhs[1] = mxCreateDoubleMatrix(dim2_, dim3_, mxCOMPLEX);
-    mxWrapCopy_dcomplex(plhs[1], out1_, dim2_*dim3_);
+  plhs[1] = mxCreateDoubleMatrix(dim2_, dim3_, mxCOMPLEX);
+  mxWrapCopy_dcomplex(plhs[1], out1_, dim2_ * dim3_);
 
 mw_err_label:
-    if (in1_)  mxFree(in1_);
-    if (out1_) mxFree(out1_);
-    if (mw_err_txt_)
-        mexErrMsgTxt(mw_err_txt_);
+  if (in1_) mxFree(in1_);
+  if (out1_) mxFree(out1_);
+  if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); /* raised only after both frees */
 }
 
 /* ---- finufft.mw: 267 ----
- * int ier = finufftf_execute(finufftf_plan plan, fcomplex[] data_in, output fcomplex[nk, n_trans] result);
+ * int ier = finufftf_execute(finufftf_plan plan, fcomplex[] data_in, output fcomplex[nk,
+ * n_trans] result);
  */
-static const char* stubids18_ = "o int = finufftf_execute(i finufftf_plan, i fcomplex[], o fcomplex[xx])";
-
-void mexStub18(int nlhs, mxArray* plhs[],
-              int nrhs, const mxArray* prhs[])
-{
-    const char* mw_err_txt_ = 0;
-    finufftf_plan*  in0_ =0; /* plan       */
-    fcomplex*   in1_ =0; /* data_in    */
-    int         out0_;   /* ier        */
-    fcomplex*   out1_=0; /* result     */
-    mwSize      dim2_;   /* nk         */
-    mwSize      dim3_;   /* n_trans    */
-
-    dim2_ = (mwSize) mxWrapGetScalar(prhs[2], &mw_err_txt_);
-    dim3_ = (mwSize) mxWrapGetScalar(prhs[3], &mw_err_txt_);
-
-    in0_ = (finufftf_plan*) mxWrapGetP(prhs[0], "finufftf_plan:%p", &mw_err_txt_);
-    if (mw_err_txt_)
-        goto mw_err_label;
-    if (mxGetM(prhs[1])*mxGetN(prhs[1]) != 0) {
-        if( mxGetClassID(prhs[1]) != mxSINGLE_CLASS )
-            mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected";
-        if (mw_err_txt_) goto mw_err_label;
-        in1_ = mxWrapGetArray_single_fcomplex(prhs[1], &mw_err_txt_);
-        if (mw_err_txt_)
-            goto mw_err_label;
-    } else
-        in1_ = NULL;
-    if (!in0_) {
-        mw_err_txt_ = "Argument plan cannot be null";
-        goto mw_err_label;
-    }
-    out1_ = (fcomplex*) mxMalloc(dim2_*dim3_*sizeof(fcomplex));
-    if (mexprofrecord_)
-        mexprofrecord_[18]++;
-    out0_ = finufftf_execute(*in0_, in1_, out1_);
+static const char *stubids18_ = /* dispatch key that mexFunction compares against */
+    "o int = finufftf_execute(i finufftf_plan, i fcomplex[], o fcomplex[xx])";
+
+void mexStub18(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
+  const char *mw_err_txt_ = 0; /* error text; non-null sends control to mw_err_label */
+  finufftf_plan *in0_     = 0; /* plan: unwrapped from prhs[0] */
+  fcomplex *in1_          = 0; /* data_in: taken from prhs[1], NULL when it is empty */
+  int out0_;                   /* ier: return value of finufftf_execute */
+  fcomplex *out1_ = 0;         /* result: mxMalloc'd scratch, copied into plhs[1] */
+  mwSize dim2_;                /* nk: first dimension of result, read from prhs[2] */
+  mwSize dim3_;                /* n_trans: second dimension of result, from prhs[3] */
+
+  dim2_ = (mwSize)mxWrapGetScalar(prhs[2], &mw_err_txt_);
+  dim3_ = (mwSize)mxWrapGetScalar(prhs[3], &mw_err_txt_);
+
+  in0_ = (finufftf_plan *)mxWrapGetP(prhs[0], "finufftf_plan:%p", &mw_err_txt_);
+  if (mw_err_txt_) goto mw_err_label;
+  if (mxGetM(prhs[1]) * mxGetN(prhs[1]) != 0) {
+    if (mxGetClassID(prhs[1]) != mxSINGLE_CLASS)
+      mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected";
+    if (mw_err_txt_) goto mw_err_label;
+    in1_ = mxWrapGetArray_single_fcomplex(prhs[1], &mw_err_txt_);
+    if (mw_err_txt_) goto mw_err_label;
+  } else
+    in1_ = NULL; /* empty prhs[1]: the library receives a null input pointer */
+  if (!in0_) {
+    mw_err_txt_ = "Argument plan cannot be null";
+    goto mw_err_label;
+  }
+  out1_ = (fcomplex *)mxMalloc(dim2_ * dim3_ * sizeof(fcomplex));
+  if (mexprofrecord_) mexprofrecord_[18]++; /* call counter, only while profiling */
+  out0_ = finufftf_execute(*in0_, in1_, out1_); /* input first, output second here */
 #if MX_HAS_INTERLEAVED_COMPLEX
-    plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL);
-    *mxGetDoubles(plhs[0]) = out0_;
+  plhs[0]                = mxCreateDoubleMatrix(1, 1, mxREAL);
+  *mxGetDoubles(plhs[0]) = out0_;
 #else
-    plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL);
-    *mxGetPr(plhs[0]) = out0_;
+  plhs[0]           = mxCreateDoubleMatrix(1, 1, mxREAL);
+  *mxGetPr(plhs[0]) = out0_;
 #endif
-    plhs[1] = mxCreateNumericMatrix(dim2_, dim3_, mxSINGLE_CLASS, mxCOMPLEX);
-    mxWrapCopy_single_fcomplex(plhs[1], out1_, dim2_*dim3_);
+  plhs[1] = mxCreateNumericMatrix(dim2_, dim3_, mxSINGLE_CLASS, mxCOMPLEX);
+  mxWrapCopy_single_fcomplex(plhs[1], out1_, dim2_ * dim3_);
 
 mw_err_label:
-    if (in1_)  mxFree(in1_);
-    if (out1_) mxFree(out1_);
-    if (mw_err_txt_)
-        mexErrMsgTxt(mw_err_txt_);
+  if (in1_) mxFree(in1_);
+  if (out1_) mxFree(out1_);
+  if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); /* raised only after both frees */
 }
 
 /* ---- finufft.mw: 279 ----
  * finufft_destroy(finufft_plan plan);
  */
-static const char* stubids19_ = "finufft_destroy(i finufft_plan)";
-
-void mexStub19(int nlhs, mxArray* plhs[],
-              int nrhs, const mxArray* prhs[])
-{
-    const char* mw_err_txt_ = 0;
-    finufft_plan*  in0_ =0; /* plan       */
-
-    in0_ = (finufft_plan*) mxWrapGetP(prhs[0], "finufft_plan:%p", &mw_err_txt_);
-    if (mw_err_txt_)
-        goto mw_err_label;
-    if (!in0_) {
-        mw_err_txt_ = "Argument plan cannot be null";
-        goto mw_err_label;
-    }
-    if (mexprofrecord_)
-        mexprofrecord_[19]++;
-    finufft_destroy(*in0_);
+static const char *stubids19_ = "finufft_destroy(i finufft_plan)"; /* dispatch key */
+
+void mexStub19(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
+  const char *mw_err_txt_ = 0; /* error text; non-null sends control to mw_err_label */
+  finufft_plan *in0_      = 0; /* plan: unwrapped from prhs[0] */
+
+  in0_ = (finufft_plan *)mxWrapGetP(prhs[0], "finufft_plan:%p", &mw_err_txt_);
+  if (mw_err_txt_) goto mw_err_label;
+  if (!in0_) { /* a null plan is rejected before it can be dereferenced below */
+    mw_err_txt_ = "Argument plan cannot be null";
+    goto mw_err_label;
+  }
+  if (mexprofrecord_) mexprofrecord_[19]++; /* call counter, only while profiling */
+  finufft_destroy(*in0_); /* return value, if any, is not propagated to MATLAB */
 
 mw_err_label:
-    if (mw_err_txt_)
-        mexErrMsgTxt(mw_err_txt_);
+  if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_);
 }
 
 /* ---- finufft.mw: 281 ----
  * finufftf_destroy(finufftf_plan plan);
  */
-static const char* stubids20_ = "finufftf_destroy(i finufftf_plan)";
-
-void mexStub20(int nlhs, mxArray* plhs[],
-              int nrhs, const mxArray* prhs[])
-{
-    const char* mw_err_txt_ = 0;
-    finufftf_plan*  in0_ =0; /* plan       */
-
-    in0_ = (finufftf_plan*) mxWrapGetP(prhs[0], "finufftf_plan:%p", &mw_err_txt_);
-    if (mw_err_txt_)
-        goto mw_err_label;
-    if (!in0_) {
-        mw_err_txt_ = "Argument plan cannot be null";
-        goto mw_err_label;
-    }
-    if (mexprofrecord_)
-        mexprofrecord_[20]++;
-    finufftf_destroy(*in0_);
+static const char *stubids20_ = "finufftf_destroy(i finufftf_plan)"; /* dispatch key */
+
+void mexStub20(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
+  const char *mw_err_txt_ = 0; /* error text; non-null sends control to mw_err_label */
+  finufftf_plan *in0_     = 0; /* plan: unwrapped from prhs[0] */
+
+  in0_ = (finufftf_plan *)mxWrapGetP(prhs[0], "finufftf_plan:%p", &mw_err_txt_);
+  if (mw_err_txt_) goto mw_err_label;
+  if (!in0_) { /* a null plan is rejected before it can be dereferenced below */
+    mw_err_txt_ = "Argument plan cannot be null";
+    goto mw_err_label;
+  }
+  if (mexprofrecord_) mexprofrecord_[20]++; /* call counter, only while profiling */
+  finufftf_destroy(*in0_); /* return value, if any, is not propagated to MATLAB */
 
 mw_err_label:
-    if (mw_err_txt_)
-        mexErrMsgTxt(mw_err_txt_);
+  if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_);
 }
 
 /* ----
  */
-void mexFunction(int nlhs, mxArray* plhs[],
-                 int nrhs, const mxArray* prhs[])
-{
-    char id[512];
-    if (nrhs == 0) {
-        mexPrintf("Mex function installed\n");
-        return;
+void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
+  char id[512]; // stub identifier from prhs[0]; reused for the log file name
+  if (nrhs == 0) { // called with no arguments: only announce that the MEX file loaded
+    mexPrintf("Mex function installed\n");
+    return;
+  }
+
+  if (mxGetString(prhs[0], id, sizeof(id)) != 0)
+    mexErrMsgTxt("Identifier should be a string");
+  else if (strcmp(id, stubids1_) == 0)
+    mexStub1(nlhs, plhs, nrhs - 1, prhs + 1); // stubs never see the identifier argument
+  else if (strcmp(id, stubids2_) == 0)
+    mexStub2(nlhs, plhs, nrhs - 1, prhs + 1);
+  else if (strcmp(id, stubids3_) == 0)
+    mexStub3(nlhs, plhs, nrhs - 1, prhs + 1);
+  else if (strcmp(id, stubids4_) == 0)
+    mexStub4(nlhs, plhs, nrhs - 1, prhs + 1);
+  else if (strcmp(id, stubids5_) == 0)
+    mexStub5(nlhs, plhs, nrhs - 1, prhs + 1);
+  else if (strcmp(id, stubids6_) == 0)
+    mexStub6(nlhs, plhs, nrhs - 1, prhs + 1);
+  else if (strcmp(id, stubids7_) == 0)
+    mexStub7(nlhs, plhs, nrhs - 1, prhs + 1);
+  else if (strcmp(id, stubids8_) == 0)
+    mexStub8(nlhs, plhs, nrhs - 1, prhs + 1);
+  else if (strcmp(id, stubids9_) == 0)
+    mexStub9(nlhs, plhs, nrhs - 1, prhs + 1);
+  else if (strcmp(id, stubids10_) == 0)
+    mexStub10(nlhs, plhs, nrhs - 1, prhs + 1);
+  else if (strcmp(id, stubids11_) == 0)
+    mexStub11(nlhs, plhs, nrhs - 1, prhs + 1);
+  else if (strcmp(id, stubids12_) == 0)
+    mexStub12(nlhs, plhs, nrhs - 1, prhs + 1);
+  else if (strcmp(id, stubids13_) == 0)
+    mexStub13(nlhs, plhs, nrhs - 1, prhs + 1);
+  else if (strcmp(id, stubids14_) == 0)
+    mexStub14(nlhs, plhs, nrhs - 1, prhs + 1);
+  else if (strcmp(id, stubids15_) == 0)
+    mexStub15(nlhs, plhs, nrhs - 1, prhs + 1);
+  else if (strcmp(id, stubids16_) == 0)
+    mexStub16(nlhs, plhs, nrhs - 1, prhs + 1);
+  else if (strcmp(id, stubids17_) == 0)
+    mexStub17(nlhs, plhs, nrhs - 1, prhs + 1);
+  else if (strcmp(id, stubids18_) == 0)
+    mexStub18(nlhs, plhs, nrhs - 1, prhs + 1);
+  else if (strcmp(id, stubids19_) == 0)
+    mexStub19(nlhs, plhs, nrhs - 1, prhs + 1);
+  else if (strcmp(id, stubids20_) == 0)
+    mexStub20(nlhs, plhs, nrhs - 1, prhs + 1);
+  else if (strcmp(id, "*profile on*") == 0) {
+    if (!mexprofrecord_) {
+      mexprofrecord_ = (int *)malloc(21 * sizeof(int));
+      mexLock(); // paired with mexUnlock() in the "*profile off*" branch
     }
-
-    if (mxGetString(prhs[0], id, sizeof(id)) != 0)
-        mexErrMsgTxt("Identifier should be a string");
-    else if (strcmp(id, stubids1_) == 0)
-        mexStub1(nlhs,plhs, nrhs-1,prhs+1);
-    else if (strcmp(id, stubids2_) == 0)
-        mexStub2(nlhs,plhs, nrhs-1,prhs+1);
-    else if (strcmp(id, stubids3_) == 0)
-        mexStub3(nlhs,plhs, nrhs-1,prhs+1);
-    else if (strcmp(id, stubids4_) == 0)
-        mexStub4(nlhs,plhs, nrhs-1,prhs+1);
-    else if (strcmp(id, stubids5_) == 0)
-        mexStub5(nlhs,plhs, nrhs-1,prhs+1);
-    else if (strcmp(id, stubids6_) == 0)
-        mexStub6(nlhs,plhs, nrhs-1,prhs+1);
-    else if (strcmp(id, stubids7_) == 0)
-        mexStub7(nlhs,plhs, nrhs-1,prhs+1);
-    else if (strcmp(id, stubids8_) == 0)
-        mexStub8(nlhs,plhs, nrhs-1,prhs+1);
-    else if (strcmp(id, stubids9_) == 0)
-        mexStub9(nlhs,plhs, nrhs-1,prhs+1);
-    else if (strcmp(id, stubids10_) == 0)
-        mexStub10(nlhs,plhs, nrhs-1,prhs+1);
-    else if (strcmp(id, stubids11_) == 0)
-        mexStub11(nlhs,plhs, nrhs-1,prhs+1);
-    else if (strcmp(id, stubids12_) == 0)
-        mexStub12(nlhs,plhs, nrhs-1,prhs+1);
-    else if (strcmp(id, stubids13_) == 0)
-        mexStub13(nlhs,plhs, nrhs-1,prhs+1);
-    else if (strcmp(id, stubids14_) == 0)
-        mexStub14(nlhs,plhs, nrhs-1,prhs+1);
-    else if (strcmp(id, stubids15_) == 0)
-        mexStub15(nlhs,plhs, nrhs-1,prhs+1);
-    else if (strcmp(id, stubids16_) == 0)
-        mexStub16(nlhs,plhs, nrhs-1,prhs+1);
-    else if (strcmp(id, stubids17_) == 0)
-        mexStub17(nlhs,plhs, nrhs-1,prhs+1);
-    else if (strcmp(id, stubids18_) == 0)
-        mexStub18(nlhs,plhs, nrhs-1,prhs+1);
-    else if (strcmp(id, stubids19_) == 0)
-        mexStub19(nlhs,plhs, nrhs-1,prhs+1);
-    else if (strcmp(id, stubids20_) == 0)
-        mexStub20(nlhs,plhs, nrhs-1,prhs+1);
-    else if (strcmp(id, "*profile on*") == 0) {
-        if (!mexprofrecord_) {
-            mexprofrecord_ = (int*) malloc(21 * sizeof(int));
-            mexLock();
-        }
-        memset(mexprofrecord_, 0, 21 * sizeof(int));
-    } else if (strcmp(id, "*profile off*") == 0) {
-        if (mexprofrecord_) {
-            free(mexprofrecord_);
-            mexUnlock();
-        }
-        mexprofrecord_ = NULL;
-    } else if (strcmp(id, "*profile report*") == 0) {
-        if (!mexprofrecord_)
-            mexPrintf("Profiler inactive\n");
-        mexPrintf("%d calls to finufft.mw:166\n", mexprofrecord_[1]);
-        mexPrintf("%d calls to finufft.mw:167\n", mexprofrecord_[2]);
-        mexPrintf("%d calls to finufft.mw:169\n", mexprofrecord_[3]);
-        mexPrintf("%d calls to finufft.mw:170\n", mexprofrecord_[4]);
-        mexPrintf("%d calls to finufft.mw:172\n", mexprofrecord_[5]);
-        mexPrintf("%d calls to finufft.mw:173\n", mexprofrecord_[6]);
-        mexPrintf("%d calls to finufft.mw:184\n", mexprofrecord_[7]);
-        mexPrintf("%d calls to finufft.mw:187\n", mexprofrecord_[8]);
-        mexPrintf("%d calls to finufft.mw:190\n", mexprofrecord_[9]);
-        mexPrintf("%d calls to finufft.mw:192\n", mexprofrecord_[10]);
-        mexPrintf("%d calls to finufft.mw:222\n", mexprofrecord_[11]);
-        mexPrintf("%d calls to finufft.mw:224\n", mexprofrecord_[12]);
-        mexPrintf("%d calls to finufft.mw:251\n", mexprofrecord_[13]);
-        mexPrintf("%d calls to finufft.mw:253\n", mexprofrecord_[14]);
-        mexPrintf("%d calls to finufft.mw:259\n", mexprofrecord_[15]);
-        mexPrintf("%d calls to finufft.mw:261\n", mexprofrecord_[16]);
-        mexPrintf("%d calls to finufft.mw:265\n", mexprofrecord_[17]);
-        mexPrintf("%d calls to finufft.mw:267\n", mexprofrecord_[18]);
-        mexPrintf("%d calls to finufft.mw:279\n", mexprofrecord_[19]);
-        mexPrintf("%d calls to finufft.mw:281\n", mexprofrecord_[20]);
-    } else if (strcmp(id, "*profile log*") == 0) {
-        FILE* logfp;
-        if (nrhs != 2 || mxGetString(prhs[1], id, sizeof(id)) != 0)
-            mexErrMsgTxt("Must have two string arguments");
-        logfp = fopen(id, "w+");
-        if (!logfp)
-            mexErrMsgTxt("Cannot open log for output");
-        if (!mexprofrecord_)
-            fprintf(logfp, "Profiler inactive\n");
-        fprintf(logfp, "%d calls to finufft.mw:166\n", mexprofrecord_[1]);
-        fprintf(logfp, "%d calls to finufft.mw:167\n", mexprofrecord_[2]);
-        fprintf(logfp, "%d calls to finufft.mw:169\n", mexprofrecord_[3]);
-        fprintf(logfp, "%d calls to finufft.mw:170\n", mexprofrecord_[4]);
-        fprintf(logfp, "%d calls to finufft.mw:172\n", mexprofrecord_[5]);
-        fprintf(logfp, "%d calls to finufft.mw:173\n", mexprofrecord_[6]);
-        fprintf(logfp, "%d calls to finufft.mw:184\n", mexprofrecord_[7]);
-        fprintf(logfp, "%d calls to finufft.mw:187\n", mexprofrecord_[8]);
-        fprintf(logfp, "%d calls to finufft.mw:190\n", mexprofrecord_[9]);
-        fprintf(logfp, "%d calls to finufft.mw:192\n", mexprofrecord_[10]);
-        fprintf(logfp, "%d calls to finufft.mw:222\n", mexprofrecord_[11]);
-        fprintf(logfp, "%d calls to finufft.mw:224\n", mexprofrecord_[12]);
-        fprintf(logfp, "%d calls to finufft.mw:251\n", mexprofrecord_[13]);
-        fprintf(logfp, "%d calls to finufft.mw:253\n", mexprofrecord_[14]);
-        fprintf(logfp, "%d calls to finufft.mw:259\n", mexprofrecord_[15]);
-        fprintf(logfp, "%d calls to finufft.mw:261\n", mexprofrecord_[16]);
-        fprintf(logfp, "%d calls to finufft.mw:265\n", mexprofrecord_[17]);
-        fprintf(logfp, "%d calls to finufft.mw:267\n", mexprofrecord_[18]);
-        fprintf(logfp, "%d calls to finufft.mw:279\n", mexprofrecord_[19]);
-        fprintf(logfp, "%d calls to finufft.mw:281\n", mexprofrecord_[20]);
-        fclose(logfp);
-    } else
-        mexErrMsgTxt("Unknown identifier");
+    memset(mexprofrecord_, 0, 21 * sizeof(int)); // "on" always restarts counters at 0
+  } else if (strcmp(id, "*profile off*") == 0) {
+    if (mexprofrecord_) {
+      free(mexprofrecord_);
+      mexUnlock();
+    }
+    mexprofrecord_ = NULL;
+  } else if (strcmp(id, "*profile report*") == 0) {
+    if (!mexprofrecord_) { mexPrintf("Profiler inactive\n"); return; } // no counters
+    mexPrintf("%d calls to finufft.mw:166\n", mexprofrecord_[1]);
+    mexPrintf("%d calls to finufft.mw:167\n", mexprofrecord_[2]);
+    mexPrintf("%d calls to finufft.mw:169\n", mexprofrecord_[3]);
+    mexPrintf("%d calls to finufft.mw:170\n", mexprofrecord_[4]);
+    mexPrintf("%d calls to finufft.mw:172\n", mexprofrecord_[5]);
+    mexPrintf("%d calls to finufft.mw:173\n", mexprofrecord_[6]);
+    mexPrintf("%d calls to finufft.mw:184\n", mexprofrecord_[7]);
+    mexPrintf("%d calls to finufft.mw:187\n", mexprofrecord_[8]);
+    mexPrintf("%d calls to finufft.mw:190\n", mexprofrecord_[9]);
+    mexPrintf("%d calls to finufft.mw:192\n", mexprofrecord_[10]);
+    mexPrintf("%d calls to finufft.mw:222\n", mexprofrecord_[11]);
+    mexPrintf("%d calls to finufft.mw:224\n", mexprofrecord_[12]);
+    mexPrintf("%d calls to finufft.mw:251\n", mexprofrecord_[13]);
+    mexPrintf("%d calls to finufft.mw:253\n", mexprofrecord_[14]);
+    mexPrintf("%d calls to finufft.mw:259\n", mexprofrecord_[15]);
+    mexPrintf("%d calls to finufft.mw:261\n", mexprofrecord_[16]);
+    mexPrintf("%d calls to finufft.mw:265\n", mexprofrecord_[17]);
+    mexPrintf("%d calls to finufft.mw:267\n", mexprofrecord_[18]);
+    mexPrintf("%d calls to finufft.mw:279\n", mexprofrecord_[19]);
+    mexPrintf("%d calls to finufft.mw:281\n", mexprofrecord_[20]);
+  } else if (strcmp(id, "*profile log*") == 0) {
+    FILE *logfp;
+    if (nrhs != 2 || mxGetString(prhs[1], id, sizeof(id)) != 0)
+      mexErrMsgTxt("Must have two string arguments");
+    logfp = fopen(id, "w+"); // id now holds the log path read from prhs[1]
+    if (!logfp) mexErrMsgTxt("Cannot open log for output");
+    if (!mexprofrecord_) { fprintf(logfp, "Profiler inactive\n"); fclose(logfp); return; }
+    fprintf(logfp, "%d calls to finufft.mw:166\n", mexprofrecord_[1]);
+    fprintf(logfp, "%d calls to finufft.mw:167\n", mexprofrecord_[2]);
+    fprintf(logfp, "%d calls to finufft.mw:169\n", mexprofrecord_[3]);
+    fprintf(logfp, "%d calls to finufft.mw:170\n", mexprofrecord_[4]);
+    fprintf(logfp, "%d calls to finufft.mw:172\n", mexprofrecord_[5]);
+    fprintf(logfp, "%d calls to finufft.mw:173\n", mexprofrecord_[6]);
+    fprintf(logfp, "%d calls to finufft.mw:184\n", mexprofrecord_[7]);
+    fprintf(logfp, "%d calls to finufft.mw:187\n", mexprofrecord_[8]);
+    fprintf(logfp, "%d calls to finufft.mw:190\n", mexprofrecord_[9]);
+    fprintf(logfp, "%d calls to finufft.mw:192\n", mexprofrecord_[10]);
+    fprintf(logfp, "%d calls to finufft.mw:222\n", mexprofrecord_[11]);
+    fprintf(logfp, "%d calls to finufft.mw:224\n", mexprofrecord_[12]);
+    fprintf(logfp, "%d calls to finufft.mw:251\n", mexprofrecord_[13]);
+    fprintf(logfp, "%d calls to finufft.mw:253\n", mexprofrecord_[14]);
+    fprintf(logfp, "%d calls to finufft.mw:259\n", mexprofrecord_[15]);
+    fprintf(logfp, "%d calls to finufft.mw:261\n", mexprofrecord_[16]);
+    fprintf(logfp, "%d calls to finufft.mw:265\n", mexprofrecord_[17]);
+    fprintf(logfp, "%d calls to finufft.mw:267\n", mexprofrecord_[18]);
+    fprintf(logfp, "%d calls to finufft.mw:279\n", mexprofrecord_[19]);
+    fprintf(logfp, "%d calls to finufft.mw:281\n", mexprofrecord_[20]);
+    fclose(logfp);
+  } else
+    mexErrMsgTxt("Unknown identifier");
 }
-
diff --git a/perftest/big2d2f.cpp b/perftest/big2d2f.cpp
index 4b59a72df..1a87067d2 100644
--- a/perftest/big2d2f.cpp
+++ b/perftest/big2d2f.cpp
@@ -10,31 +10,29 @@
 #include <finufft.h>
 
 // also used in this example...
-#include <vector>
 #include <complex>
 #include <iostream>
 #include <omp.h>
+#include <vector>
 using namespace std;
 
-int test_finufft(finufft_opts* opts)
-{
-    size_t nj = 129*129*2;
-    size_t ms = 129, mt = 129;
-    size_t ntrans = 75000;     // the point is: 129*129*2*75000 > 2^31 ~ 2.15e9
-    std::vector<float> x(nj);   // bunch of zero data
-    std::vector<float> y(nj);
-    std::vector<std::complex<float>> cj(ntrans*nj);
-    std::vector<std::complex<float>> fk(ntrans*ms*mt);
+int test_finufft(finufft_opts *opts) { // one finufftf2d2many call; returns its status
+  size_t nj = 129 * 129 * 2; // length of x and y
+  size_t ms = 129, mt = 129; // fk is sized ntrans * ms * mt
+  size_t ntrans = 75000;    // the point is: ntrans * nj = 129*129*2*75000 > 2^31 ~ 2.15e9
+  std::vector<float> x(nj); // value-initialized, so all coordinates are zero
+  std::vector<float> y(nj);
+  std::vector<std::complex<float>> cj(ntrans * nj);
+  std::vector<std::complex<float>> fk(ntrans * ms * mt);
 
-    int ier = finufftf2d2many(ntrans, nj, x.data(), y.data(), cj.data(),
-                          -1, 1e-3, ms, mt, fk.data(), opts);
+  int ier = finufftf2d2many(ntrans, nj, x.data(), y.data(), cj.data(), -1, 1e-3, ms, mt,
+                            fk.data(), opts);
 
-    std::cout << "\tbig2d2f finufft status: " << ier << std::endl;
-    return ier;
+  std::cout << "\tbig2d2f finufft status: " << ier << std::endl;
+  return ier; // the caller (main) uses this as the process exit code
 }
 
-int main(int argc, char* argv[])
-{
+int main(int argc, char *argv[]) {
   finufft_opts opts;
   finufftf_default_opts(&opts);
   return test_finufft(&opts);
diff --git a/perftest/cuda/cuperftest.cu b/perftest/cuda/cuperftest.cu
index 5b51fe3ac..f72ffb3e6 100644
--- a/perftest/cuda/cuperftest.cu
+++ b/perftest/cuda/cuperftest.cu
@@ -14,34 +14,34 @@
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
 
-std::string get_or(const std::unordered_map<std::string, std::string> &m, const std::string &key,
-                   const std::string &default_value) {
-    auto it = m.find(key);
-    if (it == m.end()) {
-        return default_value;
-    }
-    return it->second;
+std::string get_or(const std::unordered_map<std::string, std::string> &m,
+                   const std::string &key, const std::string &default_value) {
+  auto it = m.find(key); // single lookup; find() never inserts a missing key
+  if (it == m.end()) {
+    return default_value; // key absent: a copy of the fallback is returned
+  }
+  return it->second; // key present: a copy of the stored value is returned
 }
 
 struct test_options_t {
-    char prec;
-    int type;
-    int n_runs;
-    int N[3];
-    int M;
-    int ntransf;
-    int kerevalmethod;
-    int method;
-    int sort;
-    double tol;
-
-    test_options_t(int argc, char *argv[]) {
-        std::unordered_map<std::string, std::string> options_map;
-
-        while (true) {
-            int option_index = 0;
-
-            // clang-format off
+  char prec;
+  int type;
+  int n_runs;
+  int N[3];
+  int M;
+  int ntransf;
+  int kerevalmethod;
+  int method;
+  int sort;
+  double tol;
+
+  test_options_t(int argc, char *argv[]) {
+    std::unordered_map<std::string, std::string> options_map;
+
+    while (true) {
+      int option_index = 0;
+
+      // clang-format off
             static struct option long_options[] {
                 {"prec", required_argument, 0, 0},
                 {"type", required_argument, 0, 0},
@@ -57,251 +57,248 @@ struct test_options_t {
                 {"sort", required_argument, 0, 0},
                 {0, 0, 0, 0},
             };
-            // clang-format on
-
-            int c = getopt_long(argc, argv, "", long_options, &option_index);
-            if (c == -1)
-                break;
-
-            switch (c) {
-            case 0:
-                options_map[long_options[option_index].name] = optarg;
-                break;
-
-            default:
-                break;
-            }
-        }
-
-        prec = get_or(options_map, "prec", "f")[0];
-        type = std::stoi(get_or(options_map, "type", "1"));
-        n_runs = std::stoi(get_or(options_map, "n_runs", "10"));
-        N[0] = std::stof(get_or(options_map, "N1", "1E6"));
-        N[1] = std::stof(get_or(options_map, "N2", "1"));
-        N[2] = std::stof(get_or(options_map, "N3", "1"));
-        M = std::stof(get_or(options_map, "M", "2E6"));
-        ntransf = std::stoi(get_or(options_map, "ntransf", "1"));
-        method = std::stoi(get_or(options_map, "method", "1"));
-        kerevalmethod = std::stoi(get_or(options_map, "kerevalmethod", "1"));
-        sort = std::stoi(get_or(options_map, "sort", "1"));
-        tol = std::stof(get_or(options_map, "tol", "1E-5"));
-    }
+      // clang-format on
+
+      int c = getopt_long(argc, argv, "", long_options, &option_index);
+      if (c == -1) break;
+
+      switch (c) {
+      case 0:
+        options_map[long_options[option_index].name] = optarg;
+        break;
 
-    friend std::ostream &operator<<(std::ostream &outs, const test_options_t &opts) {
-        return outs << "# prec = " << opts.prec << "\n"
-                    << "# type = " << opts.type << "\n"
-                    << "# n_runs = " << opts.n_runs << "\n"
-                    << "# N1 = " << opts.N[0] << "\n"
-                    << "# N2 = " << opts.N[1] << "\n"
-                    << "# N3 = " << opts.N[2] << "\n"
-                    << "# M = " << opts.M << "\n"
-                    << "# ntransf = " << opts.ntransf << "\n"
-                    << "# method = " << opts.method << "\n"
-                    << "# kerevalmethod = " << opts.kerevalmethod << "\n"
-                    << "# sort = " << opts.sort << "\n"
-                    << "# tol = " << opts.tol << "\n";
+      default:
+        break;
+      }
     }
+
+    prec          = get_or(options_map, "prec", "f")[0];
+    type          = std::stoi(get_or(options_map, "type", "1"));
+    n_runs        = std::stoi(get_or(options_map, "n_runs", "10"));
+    N[0]          = std::stof(get_or(options_map, "N1", "1E6"));
+    N[1]          = std::stof(get_or(options_map, "N2", "1"));
+    N[2]          = std::stof(get_or(options_map, "N3", "1"));
+    M             = std::stof(get_or(options_map, "M", "2E6"));
+    ntransf       = std::stoi(get_or(options_map, "ntransf", "1"));
+    method        = std::stoi(get_or(options_map, "method", "1"));
+    kerevalmethod = std::stoi(get_or(options_map, "kerevalmethod", "1"));
+    sort          = std::stoi(get_or(options_map, "sort", "1"));
+    tol           = std::stof(get_or(options_map, "tol", "1E-5"));
+  }
+
+  friend std::ostream &operator<<(std::ostream &outs, const test_options_t &opts) {
+    return outs << "# prec = " << opts.prec << "\n"
+                << "# type = " << opts.type << "\n"
+                << "# n_runs = " << opts.n_runs << "\n"
+                << "# N1 = " << opts.N[0] << "\n"
+                << "# N2 = " << opts.N[1] << "\n"
+                << "# N3 = " << opts.N[2] << "\n"
+                << "# M = " << opts.M << "\n"
+                << "# ntransf = " << opts.ntransf << "\n"
+                << "# method = " << opts.method << "\n"
+                << "# kerevalmethod = " << opts.kerevalmethod << "\n"
+                << "# sort = " << opts.sort << "\n"
+                << "# tol = " << opts.tol << "\n";
+  }
 };
 
 struct CudaTimer {
-    CudaTimer() {}
+  CudaTimer() {}
 
-    ~CudaTimer() {
-        for (auto &event : start_)
-            cudaEventDestroy(event);
-        for (auto &event : stop_)
-            cudaEventDestroy(event);
-    }
+  ~CudaTimer() {
+    for (auto &event : start_) cudaEventDestroy(event);
+    for (auto &event : stop_) cudaEventDestroy(event);
+  }
 
-    void start() {
-        start_.push_back(cudaEvent_t{});
-        stop_.push_back(cudaEvent_t{});
+  void start() {
+    start_.push_back(cudaEvent_t{});
+    stop_.push_back(cudaEvent_t{});
 
-        cudaEventCreate(&start_.back());
-        cudaEventCreate(&stop_.back());
+    cudaEventCreate(&start_.back());
+    cudaEventCreate(&stop_.back());
 
-        cudaEventRecord(start_.back());
-    }
+    cudaEventRecord(start_.back());
+  }
 
-    void stop() { cudaEventRecord(stop_.back()); }
+  void stop() { cudaEventRecord(stop_.back()); }
 
-    void sync() {
-        for (auto &event : stop_)
-            cudaEventSynchronize(event);
-    }
+  void sync() {
+    for (auto &event : stop_) cudaEventSynchronize(event);
+  }
 
-    float mean() { return this->tot() / start_.size(); }
+  float mean() { return this->tot() / start_.size(); }
 
-    float std() {
-        float avg = this->mean();
+  float std() {
+    float avg = this->mean();
 
-        double var = 0.0;
-        for (int i = 0; i < start_.size(); ++i) {
-            float dt;
-            cudaEventElapsedTime(&dt, start_[i], stop_[i]);
-            var += (dt - avg) * (dt - avg);
-        }
-        var /= start_.size();
-
-        return sqrt(var);
+    double var = 0.0;
+    for (int i = 0; i < start_.size(); ++i) {
+      float dt;
+      cudaEventElapsedTime(&dt, start_[i], stop_[i]);
+      var += (dt - avg) * (dt - avg);
     }
+    var /= start_.size();
 
-    float tot() {
-        float dt_tot = 0.;
-        for (int i = 0; i < start_.size(); ++i) {
-            float dt;
-            cudaEventElapsedTime(&dt, start_[i], stop_[i]);
-            dt_tot += dt;
-        }
+    return sqrt(var);
+  }
 
-        return dt_tot;
+  float tot() {
+    float dt_tot = 0.;
+    for (int i = 0; i < start_.size(); ++i) {
+      float dt;
+      cudaEventElapsedTime(&dt, start_[i], stop_[i]);
+      dt_tot += dt;
     }
 
-    int count() { return start_.size(); }
+    return dt_tot;
+  }
+
+  int count() { return start_.size(); }
 
-    std::vector<cudaEvent_t> start_;
-    std::vector<cudaEvent_t> stop_;
+  std::vector<cudaEvent_t> start_;
+  std::vector<cudaEvent_t> stop_;
 };
 
-template <class F, class... Args>
-inline void timeit(F f, CudaTimer &timer, Args... args) {
-    timer.start();
-    f(args...);
-    timer.stop();
+template<class F, class... Args> inline void timeit(F f, CudaTimer &timer, Args... args) {
+  timer.start();
+  f(args...);
+  timer.stop();
 }
 
 void gpu_warmup() {
-    int nf1 = 100;
-    cufftHandle fftplan;
-    cufftPlan1d(&fftplan, nf1, CUFFT_Z2Z, 1);
-    thrust::device_vector<cufftDoubleComplex> in(nf1), out(nf1);
-    cufftExecZ2Z(fftplan, in.data().get(), out.data().get(), 1);
-    cudaDeviceSynchronize();
+  int nf1 = 100;
+  cufftHandle fftplan;
+  cufftPlan1d(&fftplan, nf1, CUFFT_Z2Z, 1);
+  thrust::device_vector<cufftDoubleComplex> in(nf1), out(nf1);
+  cufftExecZ2Z(fftplan, in.data().get(), out.data().get(), 1);
+  cudaDeviceSynchronize();
 }
 
-template <typename T>
-void run_test(test_options_t &test_opts) {
-    std::cout << test_opts;
-    const int ntransf = test_opts.ntransf;
-    const int64_t M = test_opts.M;
-    const int N = test_opts.N[0] * test_opts.N[1] * test_opts.N[2];
-    const int type = test_opts.type;
-    constexpr int iflag = 1;
-
-    thrust::host_vector<T> x(M * ntransf), y(M * ntransf), z(M * ntransf);
-    thrust::host_vector<thrust::complex<T>> c(M * ntransf), fk(N * ntransf);
-
-    thrust::device_vector<T> d_x(M * ntransf), d_y(M * ntransf), d_z(M * ntransf);
-    thrust::device_vector<thrust::complex<T>> d_c(M * ntransf), d_fk(N * ntransf);
-
-    std::default_random_engine eng(1);
-    std::uniform_real_distribution<T> dist11(-1, 1);
-    auto randm11 = [&eng, &dist11]() { return dist11(eng); };
-
-    // Making data
-    for (int64_t i = 0; i < M; i++) {
-        x[i] = M_PI * randm11(); // x in [-pi,pi)
-        y[i] = M_PI * randm11();
-        z[i] = M_PI * randm11();
-    }
-    for (int64_t i = M; i < M * ntransf; ++i) {
-        int64_t j = i % M;
-        x[i] = x[j];
-        y[i] = y[j];
-        z[i] = z[j];
+template<typename T> void run_test(test_options_t &test_opts) {
+  std::cout << test_opts;
+  const int ntransf   = test_opts.ntransf;
+  const int64_t M     = test_opts.M;
+  const int N         = test_opts.N[0] * test_opts.N[1] * test_opts.N[2];
+  const int type      = test_opts.type;
+  constexpr int iflag = 1;
+
+  thrust::host_vector<T> x(M * ntransf), y(M * ntransf), z(M * ntransf);
+  thrust::host_vector<thrust::complex<T>> c(M * ntransf), fk(N * ntransf);
+
+  thrust::device_vector<T> d_x(M * ntransf), d_y(M * ntransf), d_z(M * ntransf);
+  thrust::device_vector<thrust::complex<T>> d_c(M * ntransf), d_fk(N * ntransf);
+
+  std::default_random_engine eng(1);
+  std::uniform_real_distribution<T> dist11(-1, 1);
+  auto randm11 = [&eng, &dist11]() {
+    return dist11(eng);
+  };
+
+  // Making data
+  for (int64_t i = 0; i < M; i++) {
+    x[i] = M_PI * randm11(); // x in [-pi,pi)
+    y[i] = M_PI * randm11();
+    z[i] = M_PI * randm11();
+  }
+  for (int64_t i = M; i < M * ntransf; ++i) {
+    int64_t j = i % M;
+    x[i]      = x[j];
+    y[i]      = y[j];
+    z[i]      = z[j];
+  }
+
+  if (type == 1) {
+    for (int i = 0; i < M * ntransf; i++) {
+      c[i].real(randm11());
+      c[i].imag(randm11());
     }
 
-    if (type == 1) {
-        for (int i = 0; i < M * ntransf; i++) {
-            c[i].real(randm11());
-            c[i].imag(randm11());
-        }
-
-    } else if (type == 2) {
-        for (int i = 0; i < N * ntransf; i++) {
-            fk[i].real(randm11());
-            fk[i].imag(randm11());
-        }
-    } else {
-        std::cerr << "Invalid type " << type << " supplied\n";
-        return;
+  } else if (type == 2) {
+    for (int i = 0; i < N * ntransf; i++) {
+      fk[i].real(randm11());
+      fk[i].imag(randm11());
     }
-
-    gpu_warmup();
-
-    cufinufft_opts opts;
-    int dim = 0;
-    for (int i = 0; i < 3; ++i)
-        dim = test_opts.N[i] > 1 ? i + 1 : dim;
-
-    cufinufft_default_opts(&opts);
-    opts.gpu_method = test_opts.method;
-    opts.gpu_sort = test_opts.sort;
-    opts.gpu_kerevalmeth = test_opts.kerevalmethod;
-
-    cufinufft_plan_t<T> *dplan;
-    CudaTimer h2d_timer, makeplan_timer, setpts_timer, execute_timer, d2h_timer, amortized_timer;
-    {
-        amortized_timer.start();
-        h2d_timer.start();
-        d_x = x, d_y = y, d_z = z;
-        if (type == 1)
-            d_c = c;
-        if (type == 2)
-            d_fk = fk;
-        h2d_timer.stop();
-
-        T *d_x_p = dim >= 1 ? d_x.data().get() : nullptr;
-        T *d_y_p = dim >= 2 ? d_y.data().get() : nullptr;
-        T *d_z_p = dim == 3 ? d_z.data().get() : nullptr;
-        cuda_complex<T> *d_c_p = (cuda_complex<T> *)d_c.data().get();
-        cuda_complex<T> *d_fk_p = (cuda_complex<T> *)d_fk.data().get();
-
-        timeit(cufinufft_makeplan_impl<T>, makeplan_timer, test_opts.type, dim, test_opts.N, iflag, ntransf,
-               test_opts.tol, &dplan, &opts);
-        for (int i = 0; i < test_opts.n_runs; ++i) {
-            timeit(cufinufft_setpts_impl<T>, setpts_timer, M, d_x_p, d_y_p, d_z_p, 0, nullptr, nullptr, nullptr, dplan);
-            timeit(cufinufft_execute_impl<T>, execute_timer, d_c_p, d_fk_p, dplan);
-        }
-
-        d2h_timer.start();
-        if (type == 1)
-            fk = d_fk;
-        if (type == 2)
-            c = d_c;
-        d2h_timer.stop();
-        
-        amortized_timer.stop();
-
-        h2d_timer.sync();
-        makeplan_timer.sync();
-        setpts_timer.sync();
-        execute_timer.sync();
-        d2h_timer.sync();
-        amortized_timer.sync();
+  } else {
+    std::cerr << "Invalid type " << type << " supplied\n";
+    return;
+  }
+
+  gpu_warmup();
+
+  cufinufft_opts opts;
+  int dim = 0;
+  for (int i = 0; i < 3; ++i) dim = test_opts.N[i] > 1 ? i + 1 : dim;
+
+  cufinufft_default_opts(&opts);
+  opts.gpu_method      = test_opts.method;
+  opts.gpu_sort        = test_opts.sort;
+  opts.gpu_kerevalmeth = test_opts.kerevalmethod;
+
+  cufinufft_plan_t<T> *dplan;
+  CudaTimer h2d_timer, makeplan_timer, setpts_timer, execute_timer, d2h_timer,
+      amortized_timer;
+  {
+    amortized_timer.start();
+    h2d_timer.start();
+    d_x = x, d_y = y, d_z = z;
+    if (type == 1) d_c = c;
+    if (type == 2) d_fk = fk;
+    h2d_timer.stop();
+
+    T *d_x_p                = dim >= 1 ? d_x.data().get() : nullptr;
+    T *d_y_p                = dim >= 2 ? d_y.data().get() : nullptr;
+    T *d_z_p                = dim == 3 ? d_z.data().get() : nullptr;
+    cuda_complex<T> *d_c_p  = (cuda_complex<T> *)d_c.data().get();
+    cuda_complex<T> *d_fk_p = (cuda_complex<T> *)d_fk.data().get();
+
+    timeit(cufinufft_makeplan_impl<T>, makeplan_timer, test_opts.type, dim, test_opts.N,
+           iflag, ntransf, test_opts.tol, &dplan, &opts);
+    for (int i = 0; i < test_opts.n_runs; ++i) {
+      timeit(cufinufft_setpts_impl<T>, setpts_timer, M, d_x_p, d_y_p, d_z_p, 0, nullptr,
+             nullptr, nullptr, dplan);
+      timeit(cufinufft_execute_impl<T>, execute_timer, d_c_p, d_fk_p, dplan);
     }
 
-    const int64_t nupts_tot = M * test_opts.n_runs * ntransf;
-
-    printf("event,count,tot(ms),mean(ms),std(ms),nupts/s,ns/nupt\n");
-    printf("host_to_device,%d,%f,%f,%f,0.0,0.0\n", h2d_timer.count(), h2d_timer.tot(),
-           h2d_timer.mean(), h2d_timer.std());
-    printf("makeplan,%d,%f,%f,%f,0.0,0.0\n", makeplan_timer.count(), makeplan_timer.tot(), makeplan_timer.mean(),
-           makeplan_timer.std());
-    printf("setpts,%d,%f,%f,%f,%g,%f\n", test_opts.n_runs, setpts_timer.tot(), setpts_timer.mean(), setpts_timer.std(),
-           nupts_tot * 1000 / setpts_timer.tot(), setpts_timer.tot() * 1E6 / nupts_tot);
-    printf("execute,%d,%f,%f,%f,%g,%f\n", test_opts.n_runs, execute_timer.tot(), execute_timer.mean(),
-           execute_timer.std(), nupts_tot * 1000 / execute_timer.tot(), execute_timer.tot() * 1E6 / nupts_tot);
-    printf("device_to_host,%d,%f,%f,%f,0.0,0.0\n", d2h_timer.count(), d2h_timer.tot(),
-           d2h_timer.mean(), d2h_timer.std());
-    printf("amortized,%d,%f,%f,%f,%g,%f\n", 1, amortized_timer.tot(), amortized_timer.mean(), amortized_timer.std(),
-           nupts_tot * 1000 / amortized_timer.tot(), amortized_timer.tot() * 1E6 / nupts_tot);
+    d2h_timer.start();
+    if (type == 1) fk = d_fk;
+    if (type == 2) c = d_c;
+    d2h_timer.stop();
+
+    amortized_timer.stop();
+
+    h2d_timer.sync();
+    makeplan_timer.sync();
+    setpts_timer.sync();
+    execute_timer.sync();
+    d2h_timer.sync();
+    amortized_timer.sync();
+  }
+
+  const int64_t nupts_tot = M * test_opts.n_runs * ntransf;
+
+  printf("event,count,tot(ms),mean(ms),std(ms),nupts/s,ns/nupt\n");
+  printf("host_to_device,%d,%f,%f,%f,0.0,0.0\n", h2d_timer.count(), h2d_timer.tot(),
+         h2d_timer.mean(), h2d_timer.std());
+  printf("makeplan,%d,%f,%f,%f,0.0,0.0\n", makeplan_timer.count(), makeplan_timer.tot(),
+         makeplan_timer.mean(), makeplan_timer.std());
+  printf("setpts,%d,%f,%f,%f,%g,%f\n", test_opts.n_runs, setpts_timer.tot(),
+         setpts_timer.mean(), setpts_timer.std(), nupts_tot * 1000 / setpts_timer.tot(),
+         setpts_timer.tot() * 1E6 / nupts_tot);
+  printf("execute,%d,%f,%f,%f,%g,%f\n", test_opts.n_runs, execute_timer.tot(),
+         execute_timer.mean(), execute_timer.std(),
+         nupts_tot * 1000 / execute_timer.tot(), execute_timer.tot() * 1E6 / nupts_tot);
+  printf("device_to_host,%d,%f,%f,%f,0.0,0.0\n", d2h_timer.count(), d2h_timer.tot(),
+         d2h_timer.mean(), d2h_timer.std());
+  printf("amortized,%d,%f,%f,%f,%g,%f\n", 1, amortized_timer.tot(),
+         amortized_timer.mean(), amortized_timer.std(),
+         nupts_tot * 1000 / amortized_timer.tot(),
+         amortized_timer.tot() * 1E6 / nupts_tot);
 }
 
 int main(int argc, char *argv[]) {
-    if (argc == 2 && (std::string(argv[1]) == "--help" || std::string(argv[1]) == "-h")) {
-        test_options_t default_opts(0, nullptr);
-        // clang-format off
+  if (argc == 2 && (std::string(argv[1]) == "--help" || std::string(argv[1]) == "-h")) {
+    test_options_t default_opts(0, nullptr);
+    // clang-format off
         std::cout << "Valid options:\n"
                      "    --prec <char>\n"
                      "           float or double precision. i.e. 'f' or 'd'\n"
@@ -347,15 +344,15 @@ int main(int argc, char *argv[]) {
                      "               0: do not sort the points\n"
                      "               1: sort the points\n"
                      "           default: " << default_opts.sort << "\n";
-        // clang-format on
-        return 0;
-    }
-    test_options_t opts(argc, argv);
+    // clang-format on
+    return 0;
+  }
+  test_options_t opts(argc, argv);
 
-    if (opts.prec == 'f')
-        run_test<float>(opts);
-    else if (opts.prec == 'd')
-        run_test<double>(opts);
+  if (opts.prec == 'f')
+    run_test<float>(opts);
+  else if (opts.prec == 'd')
+    run_test<double>(opts);
 
-    return 0;
+  return 0;
 }
diff --git a/perftest/guru_timing_test.cpp b/perftest/guru_timing_test.cpp
index 145d4f1ef..90055a36b 100644
--- a/perftest/guru_timing_test.cpp
+++ b/perftest/guru_timing_test.cpp
@@ -1,11 +1,8 @@
 #include <finufft/test_defs.h>
 // for sleep call
 #if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__)
-#include<Windows.h>
-void sleep(unsigned long seconds)
-{
-    Sleep(seconds * 1000);
-}
+#include <Windows.h>
+void sleep(unsigned long seconds) { Sleep(seconds * 1000); }
 #else
 #include <unistd.h>
 #endif
@@ -14,11 +11,10 @@ using namespace finufft;
 using namespace finufft::utils;
 
 // forward declaration of helper to (repeatedly if needed) call finufft?d?
-double many_simple_calls(CPX *c,CPX *F,FLT*x, FLT*y, FLT*z,FINUFFT_PLAN plan);
-
+double many_simple_calls(CPX *c, CPX *F, FLT *x, FLT *y, FLT *z, FINUFFT_PLAN plan);
 
 // --------------------------------------------------------------------------
-int main(int argc, char* argv[])
+int main(int argc, char *argv[])
 /* Timing-only tester for the guru interface, allowing control of many params
    and opts from the command line.
    It compares doing many transforms with same NU pts, with repeated calls to
@@ -37,10 +33,10 @@ int main(int argc, char* argv[])
    debug = 0: rel errors and overall timing
            1: timing breakdowns
            2: also spreading output
-   
+
    spread_scheme = 0: sequential maximally multithreaded spread/interp
                    1: parallel singlethreaded spread/interp, nested last batch
-   
+
    Example: guru_timing_test 100 1 2 100 100 0 1000000 1e-3 1 0 0 2 2.0
 
    The unused dimensions of Nmodes may be left as zero.
@@ -51,147 +47,159 @@ int main(int argc, char* argv[])
    added 2 extra args, 5/22/20. Moved to perftests 7/23/20.
 */
 {
-  double tsleep = 0.1;  // how long wait between tests to let FFTW settle (1.0?)
+  double tsleep = 0.1;         // how long wait between tests to let FFTW settle (1.0?)
   int ntransf, type, ndim;
-  BIGINT M, N1, N2, N3; // M = # srcs, N1,N2,N3= # modes in each dim
+  BIGINT M, N1, N2, N3;        // M = # srcs, N1,N2,N3= # modes in each dim
   double w, tol = 1e-6;
-  int isign = +1;             // choose which exponential sign to test
+  int isign = +1;              // choose which exponential sign to test
   finufft_opts opts;
-  FINUFFT_DEFAULT_OPTS(&opts);   // for guru interface
-  
+  FINUFFT_DEFAULT_OPTS(&opts); // for guru interface
+
   // Collect command line arguments ------------------------------------------
-  if (argc<8 || argc>14) {
-    fprintf(stderr,"Usage: guru_timing_test ntransf type ndim N1 N2 N3 Nsrc [tol [debug [spread_thread [maxbatchsize [spread_sort [upsampfac]]]]]]\n\teg:\tguru_timing_test 100 1 2 1e2 1e2 0 1e6 1e-3 1 0 0 2\n");
+  if (argc < 8 || argc > 14) {
+    fprintf(
+        stderr,
+        "Usage: guru_timing_test ntransf type ndim N1 N2 N3 Nsrc [tol [debug "
+        "[spread_thread [maxbatchsize [spread_sort "
+        "[upsampfac]]]]]]\n\teg:\tguru_timing_test 100 1 2 1e2 1e2 0 1e6 1e-3 1 0 0 2\n");
     return 1;
   }
-  sscanf(argv[1],"%d",&ntransf);
-  sscanf(argv[2],"%d",&type);
-  sscanf(argv[3],"%d",&ndim);
-  sscanf(argv[4],"%lf",&w); N1 = (BIGINT)w;
-  sscanf(argv[5],"%lf",&w); N2 = (BIGINT)w;
-  sscanf(argv[6],"%lf",&w); N3 = (BIGINT)w;
-  sscanf(argv[7],"%lf",&w); M = (BIGINT)w;
-  if (argc>8) sscanf(argv[8],"%lf",&tol);
-  if (argc>9) sscanf(argv[9],"%d",&opts.debug);
-  opts.spread_debug = (opts.debug>1) ? 1 : 0;   // see output from spreader
-  if (argc>10) sscanf(argv[10], "%d", &opts.spread_thread);
-  if (argc>11) sscanf(argv[11], "%d", &opts.maxbatchsize); 
-  if (argc>12) sscanf(argv[12],"%d",&opts.spread_sort);
-  if (argc>13) { sscanf(argv[13],"%lf",&w); opts.upsampfac = (FLT)w; }
-
-  // Allocate and initialize input -------------------------------------------  
+  sscanf(argv[1], "%d", &ntransf);
+  sscanf(argv[2], "%d", &type);
+  sscanf(argv[3], "%d", &ndim);
+  sscanf(argv[4], "%lf", &w);
+  N1 = (BIGINT)w;
+  sscanf(argv[5], "%lf", &w);
+  N2 = (BIGINT)w;
+  sscanf(argv[6], "%lf", &w);
+  N3 = (BIGINT)w;
+  sscanf(argv[7], "%lf", &w);
+  M = (BIGINT)w;
+  if (argc > 8) sscanf(argv[8], "%lf", &tol);
+  if (argc > 9) sscanf(argv[9], "%d", &opts.debug);
+  opts.spread_debug = (opts.debug > 1) ? 1 : 0; // see output from spreader
+  if (argc > 10) sscanf(argv[10], "%d", &opts.spread_thread);
+  if (argc > 11) sscanf(argv[11], "%d", &opts.maxbatchsize);
+  if (argc > 12) sscanf(argv[12], "%d", &opts.spread_sort);
+  if (argc > 13) {
+    sscanf(argv[13], "%lf", &w);
+    opts.upsampfac = (FLT)w;
+  }
+
+  // Allocate and initialize input -------------------------------------------
   cout << scientific << setprecision(15);
-  N2 = (N2 == 0) ? 1 : N2;
-  N3 = (N3 == 0) ? 1 : N3;  
-  BIGINT N = N1*N2*N3;
-  
-  FLT* s = NULL;
-  FLT* t = NULL; 
-  FLT* u = NULL;
-  if (type == 3) {   // make target freq NU pts for type 3 (N of them)...
-    s = (FLT*)malloc(sizeof(FLT)*N);    // targ freqs (1-cmpt)
-    FLT S1 = (FLT)N1/2;            
+  N2       = (N2 == 0) ? 1 : N2;
+  N3       = (N3 == 0) ? 1 : N3;
+  BIGINT N = N1 * N2 * N3;
+
+  FLT *s = NULL;
+  FLT *t = NULL;
+  FLT *u = NULL;
+  if (type == 3) { // make target freq NU pts for type 3 (N of them)...
+    s      = (FLT *)malloc(sizeof(FLT) * N); // targ freqs (1-cmpt)
+    FLT S1 = (FLT)N1 / 2;
 #pragma omp parallel
     {
-      unsigned int se=MY_OMP_GET_THREAD_NUM();  // needed for parallel random #s
-#pragma omp for schedule(dynamic,TEST_RANDCHUNK)
-      for (BIGINT k=0; k<N; ++k) {
-      s[k] = S1*(1.7 + randm11r(&se));    // note the offset, to test type 3.
-      }      
-      if(ndim > 1) {
-        t = (FLT*)malloc(sizeof(FLT)*N);    // targ freqs (2-cmpt)
-        FLT S2 = (FLT)N2/2;
-#pragma omp for schedule(dynamic,TEST_RANDCHUNK)
-        for (BIGINT k=0; k<N; ++k) {
-          t[k] = S2*(-0.5 + randm11r(&se));  
+      unsigned int se = MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s
+#pragma omp for schedule(dynamic, TEST_RANDCHUNK)
+      for (BIGINT k = 0; k < N; ++k) {
+        s[k] = S1 * (1.7 + randm11r(&se)); // note the offset, to test type 3.
+      }
+      if (ndim > 1) {
+        t      = (FLT *)malloc(sizeof(FLT) * N); // targ freqs (2-cmpt)
+        FLT S2 = (FLT)N2 / 2;
+#pragma omp for schedule(dynamic, TEST_RANDCHUNK)
+        for (BIGINT k = 0; k < N; ++k) {
+          t[k] = S2 * (-0.5 + randm11r(&se));
         }
-      }      
-      if(ndim > 2) {
-        u = (FLT*)malloc(sizeof(FLT)*N);    // targ freqs (3-cmpt)
-        FLT S3 = (FLT)N3/2;
-#pragma omp for schedule(dynamic,TEST_RANDCHUNK)
-        for (BIGINT k=0; k<N; ++k) {
-          u[k] = S3*(0.9 + randm11r(&se));  
+      }
+      if (ndim > 2) {
+        u      = (FLT *)malloc(sizeof(FLT) * N); // targ freqs (3-cmpt)
+        FLT S3 = (FLT)N3 / 2;
+#pragma omp for schedule(dynamic, TEST_RANDCHUNK)
+        for (BIGINT k = 0; k < N; ++k) {
+          u[k] = S3 * (0.9 + randm11r(&se));
         }
       }
     }
   }
-  
-  CPX* c = (CPX*)malloc(sizeof(CPX)*M*ntransf);   // strengths 
-  CPX* F = (CPX*)malloc(sizeof(CPX)*N*ntransf);   // mode ampls  
-
-  FLT *x = (FLT *)malloc(sizeof(FLT)*M), *y=NULL, *z=NULL;  // NU pts x coords
-  if(ndim > 1)
-    y = (FLT *)malloc(sizeof(FLT)*M);        // NU pts y coords
-  if(ndim > 2)
-    z = (FLT *)malloc(sizeof(FLT)*M);        // NU pts z coords
+
+  CPX *c = (CPX *)malloc(sizeof(CPX) * M * ntransf);             // strengths
+  CPX *F = (CPX *)malloc(sizeof(CPX) * N * ntransf);             // mode ampls
+
+  FLT *x = (FLT *)malloc(sizeof(FLT) * M), *y = NULL, *z = NULL; // NU pts x coords
+  if (ndim > 1) y = (FLT *)malloc(sizeof(FLT) * M);              // NU pts y coords
+  if (ndim > 2) z = (FLT *)malloc(sizeof(FLT) * M);              // NU pts z coords
 #pragma omp parallel
   {
-    unsigned int se=MY_OMP_GET_THREAD_NUM();  // needed for parallel random #s
-#pragma omp for schedule(dynamic,TEST_RANDCHUNK)
-    for (BIGINT j=0; j<M; ++j) {
-      x[j] = M_PI*randm11r(&se);
-      if(y)
-	y[j] = M_PI*randm11r(&se);
-      if(z)
-	z[j] = M_PI*randm11r(&se);
+    unsigned int se = MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s
+#pragma omp for schedule(dynamic, TEST_RANDCHUNK)
+    for (BIGINT j = 0; j < M; ++j) {
+      x[j] = M_PI * randm11r(&se);
+      if (y) y[j] = M_PI * randm11r(&se);
+      if (z) z[j] = M_PI * randm11r(&se);
     }
-#pragma omp for schedule(dynamic,TEST_RANDCHUNK)
-    for(BIGINT i = 0; i<ntransf*M; i++)       // random strengths
-	c[i] = crandm11r(&se);
+#pragma omp for schedule(dynamic, TEST_RANDCHUNK)
+    for (BIGINT i = 0; i < ntransf * M; i++) // random strengths
+      c[i] = crandm11r(&se);
   }
 
   // Andrea found the following are needed to get reliable independent timings:
   FFTW_CLEANUP();
   FFTW_CLEANUP_THREADS();
   FFTW_FORGET_WISDOM();
-  //std::this_thread::sleep_for(std::chrono::seconds(1));
+  // std::this_thread::sleep_for(std::chrono::seconds(1));
   sleep(tsleep);
 
-  printf("FINUFFT %dd%d use guru interface to do %d calls together:-------------------\n",ndim,type,ntransf);
-  FINUFFT_PLAN plan;                  // instantiate a finufft_plan
-  finufft::utils::CNTime timer; timer.start();        // Guru Step 1
-  BIGINT n_modes[3] = {N1,N2,N3};     // #modes per dimension (ignored for t3)
+  printf("FINUFFT %dd%d use guru interface to do %d calls together:-------------------\n",
+         ndim, type, ntransf);
+  FINUFFT_PLAN plan;                // instantiate a finufft_plan
+  finufft::utils::CNTime timer;
+  timer.start();                    // Guru Step 1
+  BIGINT n_modes[3] = {N1, N2, N3}; // #modes per dimension (ignored for t3)
   int ier = FINUFFT_MAKEPLAN(type, ndim, n_modes, isign, ntransf, tol, &plan, &opts);
   // (NB: the opts struct can no longer be modified with effect!)
   double plan_t = timer.elapsedsec();
-  if (ier>1) {
-    printf("error (ier=%d)!\n",ier);
+  if (ier > 1) {
+    printf("error (ier=%d)!\n", ier);
     return ier;
   } else {
-    if (type!=3)
+    if (type != 3)
       printf("\tplan, for %lld modes: \t\t%.3g s\n", (long long)N, plan_t);
     else
       printf("\tplan:\t\t\t\t\t%.3g s\n", plan_t);
   }
-  
-  timer.restart();                    // Guru Step 2
-  ier = FINUFFT_SETPTS(plan, M, x, y, z, N, s, t, u); //(t1,2: N,s,t,u ignored)
+
+  timer.restart();                                              // Guru Step 2
+  ier           = FINUFFT_SETPTS(plan, M, x, y, z, N, s, t, u); //(t1,2: N,s,t,u ignored)
   double sort_t = timer.elapsedsec();
   if (ier) {
-    printf("error (ier=%d)!\n",ier);
+    printf("error (ier=%d)!\n", ier);
     return ier;
   } else {
-    if (type!=3)
+    if (type != 3)
       printf("\tsetpts for %lld NU pts: \t\t%.3g s\n", (long long)M, sort_t);
     else
-      printf("\tsetpts for %lld + %lld NU pts: \t%.3g s\n", (long long)M, (long long)N, sort_t);
+      printf("\tsetpts for %lld + %lld NU pts: \t%.3g s\n", (long long)M, (long long)N,
+             sort_t);
   }
-  
-  timer.restart();                     // Guru Step 3
-  ier = FINUFFT_EXECUTE(plan,c,F);
-  double exec_t=timer.elapsedsec();
+
+  timer.restart(); // Guru Step 3
+  ier           = FINUFFT_EXECUTE(plan, c, F);
+  double exec_t = timer.elapsedsec();
   if (ier) {
-    printf("error (ier=%d)!\n",ier);
+    printf("error (ier=%d)!\n", ier);
     return ier;
   } else
     printf("\texec \t\t\t\t\t%.3g s\n", exec_t);
 
   double totalTime = plan_t + sort_t + exec_t;
-  if (type!=3)
-    printf("ntr=%d: %lld NU pts to %lld modes in %.3g s \t%.3g NU pts/s\n", ntransf, (long long)M,(long long)N, totalTime, ntransf*M/totalTime);
+  if (type != 3)
+    printf("ntr=%d: %lld NU pts to %lld modes in %.3g s \t%.3g NU pts/s\n", ntransf,
+           (long long)M, (long long)N, totalTime, ntransf * M / totalTime);
   else
-    printf("ntr=%d: %lld NU pts to %lld NU pts in %.3g s \t%.3g tot NU pts/s\n", ntransf, (long long)M,(long long)N, totalTime, ntransf*(N+M)/totalTime);
+    printf("ntr=%d: %lld NU pts to %lld NU pts in %.3g s \t%.3g tot NU pts/s\n", ntransf,
+           (long long)M, (long long)N, totalTime, ntransf * (N + M) / totalTime);
 
   // Comparing timing results with repeated calls to corresponding finufft function...
 
@@ -199,40 +207,38 @@ int main(int argc, char* argv[])
   // by Andrea Malleo, but in this case we need to access the plan later
   // for many_simple_calls() to work, so we cannot do FFTW cleanup without
   // apparently causing segfault :(. So we skip them.
-  //FFTW_CLEANUP();
-  //FFTW_CLEANUP_THREADS();
-  //FFTW_FORGET_WISDOM();
-  
-  //std::this_thread::sleep_for(std::chrono::seconds(1)); if c++11 is allowed
-  sleep(tsleep); //sleep for one second using linux sleep call
-  
-  
-  printf("Compare speed of repeated calls to simple interface:------------------------\n");
+  // FFTW_CLEANUP();
+  // FFTW_CLEANUP_THREADS();
+  // FFTW_FORGET_WISDOM();
+
+  // std::this_thread::sleep_for(std::chrono::seconds(1)); if c++11 is allowed
+  sleep(tsleep); // sleep for one second using linux sleep call
+
+  printf(
+      "Compare speed of repeated calls to simple interface:------------------------\n");
   // this used to actually call Alex's old (v1.1) src/finufft?d.cpp routines.
   // Since we don't want to ship those, we now call the simple interfaces.
-  
-  double simpleTime = many_simple_calls(c,F, x, y, z, plan);
-  if (isnan(simpleTime))
-    return 1;
-  
-  if (type!=3)
-    printf("%d of:\t%lld NU pts to %lld modes in %.3g s   \t%.3g NU pts/s\n",
-           ntransf,(long long)M,(long long)N, simpleTime, ntransf*M/simpleTime);
+
+  double simpleTime = many_simple_calls(c, F, x, y, z, plan);
+  if (isnan(simpleTime)) return 1;
+
+  if (type != 3)
+    printf("%d of:\t%lld NU pts to %lld modes in %.3g s   \t%.3g NU pts/s\n", ntransf,
+           (long long)M, (long long)N, simpleTime, ntransf * M / simpleTime);
   else
-    printf("%d of:\t%lld NU pts to %lld NU pts in %.3g s  \t%.3g tot NU pts/s\n",
-           ntransf,(long long)M,(long long)N, simpleTime, ntransf*(M+N)/simpleTime);
-  printf("\tspeedup \t T_finufft%dd%d_simple / T_finufft%dd%d = %.3g\n",ndim,type,
-         ndim, type, simpleTime/totalTime);
+    printf("%d of:\t%lld NU pts to %lld NU pts in %.3g s  \t%.3g tot NU pts/s\n", ntransf,
+           (long long)M, (long long)N, simpleTime, ntransf * (M + N) / simpleTime);
+  printf("\tspeedup \t T_finufft%dd%d_simple / T_finufft%dd%d = %.3g\n", ndim, type, ndim,
+         type, simpleTime / totalTime);
 
-  
-  FINUFFT_DESTROY(plan);              // Guru Step 4
+  FINUFFT_DESTROY(plan); // Guru Step 4
   // (must be done *after* many_simple_calls, which sneaks a look at the plan!)
   // however, segfaults, maybe because plan->opts.debug changed?
-  
+
   //---------------------------- Free Memory (no need to test if NULL)
   free(F);
   free(c);
-  free(x); 
+  free(x);
   free(y);
   free(z);
   free(s);
@@ -241,7 +247,6 @@ int main(int argc, char* argv[])
   return 0;
 }
 
-
 // -------------------------------- HELPER FUNCS ----------------------------
 
 double finufftFunnel(CPX *cStart, CPX *fStart, FLT *x, FLT *y, FLT *z, FINUFFT_PLAN plan)
@@ -253,156 +258,161 @@ double finufftFunnel(CPX *cStart, CPX *fStart, FLT *x, FLT *y, FLT *z, FINUFFT_P
    Malleo 2019; xyz passed in by Barnett 5/26/20 to prevent X_orig fields.
 */
 {
-  finufft::utils::CNTime timer; timer.start();
-  int ier = 0;
-  double t = 0;
-  double fail = NAN;                  // dummy code for failure
-  finufft_opts* popts = &(plan->opts);   // opts ptr, as v1.2 simple calls need
-  switch (plan->dim){
-    
-  case 1:                    // 1D
-    switch (plan->type){
+  finufft::utils::CNTime timer;
+  timer.start();
+  int ier             = 0;
+  double t            = 0;
+  double fail         = NAN;           // dummy code for failure
+  finufft_opts *popts = &(plan->opts); // opts ptr, as v1.2 simple calls need
+  switch (plan->dim) {
+
+  case 1: // 1D
+    switch (plan->type) {
 
     case 1:
       timer.restart();
-      ier = FINUFFT1D1(plan->nj, x, cStart, plan->fftSign, plan->tol, plan->ms, fStart, popts);
-      t = timer.elapsedsec();
-      if(ier)
-	return fail;
+      ier = FINUFFT1D1(plan->nj, x, cStart, plan->fftSign, plan->tol, plan->ms, fStart,
+                       popts);
+      t   = timer.elapsedsec();
+      if (ier)
+        return fail;
       else
-	return t;
-      
+        return t;
+
     case 2:
       timer.restart();
-      ier = FINUFFT1D2(plan->nj, x, cStart, plan->fftSign, plan->tol, plan->ms, fStart, popts);
-      t = timer.elapsedsec();
-      if(ier)
-	return fail;
+      ier = FINUFFT1D2(plan->nj, x, cStart, plan->fftSign, plan->tol, plan->ms, fStart,
+                       popts);
+      t   = timer.elapsedsec();
+      if (ier)
+        return fail;
       else
-	return t;
-      
+        return t;
+
     case 3:
       timer.restart();
-      ier = FINUFFT1D3(plan->nj, x, cStart, plan->fftSign, plan->tol, plan->nk, plan->S, fStart, popts);
-      t = timer.elapsedsec();
-      if(ier)
-	return fail;
+      ier = FINUFFT1D3(plan->nj, x, cStart, plan->fftSign, plan->tol, plan->nk, plan->S,
+                       fStart, popts);
+      t   = timer.elapsedsec();
+      if (ier)
+        return fail;
       else
-	return t;
-      
+        return t;
+
     default:
-      return fail; 
+      return fail;
     }
 
-  case 2:                    // 2D
-    switch(plan->type){
-      
+  case 2: // 2D
+    switch (plan->type) {
+
     case 1:
       timer.restart();
-      ier = FINUFFT2D1(plan->nj, x,y, cStart, plan->fftSign, plan->tol, plan->ms, plan->mt, fStart, popts);
-      t = timer.elapsedsec();
-      if(ier)
-	return fail;
+      ier = FINUFFT2D1(plan->nj, x, y, cStart, plan->fftSign, plan->tol, plan->ms,
+                       plan->mt, fStart, popts);
+      t   = timer.elapsedsec();
+      if (ier)
+        return fail;
       else
-	return t;
-      
+        return t;
+
     case 2:
       timer.restart();
-      ier = FINUFFT2D2(plan->nj, x,y, cStart, plan->fftSign, plan->tol, plan->ms, plan->mt,
-     		       fStart, popts);
-      t = timer.elapsedsec();
-      if(ier)
-	return fail;
+      ier = FINUFFT2D2(plan->nj, x, y, cStart, plan->fftSign, plan->tol, plan->ms,
+                       plan->mt, fStart, popts);
+      t   = timer.elapsedsec();
+      if (ier)
+        return fail;
       else
-	return t;
+        return t;
 
     case 3:
       timer.restart();
-      ier = FINUFFT2D3(plan->nj, x,y, cStart, plan->fftSign, plan->tol, plan->nk, plan->S, plan->T,
-                       fStart, popts); 
-      t = timer.elapsedsec();
-      if(ier)
-	return fail;
+      ier = FINUFFT2D3(plan->nj, x, y, cStart, plan->fftSign, plan->tol, plan->nk,
+                       plan->S, plan->T, fStart, popts);
+      t   = timer.elapsedsec();
+      if (ier)
+        return fail;
       else
-	return t;
-      
+        return t;
+
     default:
       return fail;
     }
 
-  case 3:                    // 3D
-    switch(plan->type){
+  case 3: // 3D
+    switch (plan->type) {
 
     case 1:
       timer.restart();
-      ier = FINUFFT3D1(plan->nj, x,y,z, cStart, plan->fftSign, plan->tol,
-                       plan->ms, plan->mt, plan->mu, fStart, popts);
-      t = timer.elapsedsec();
-      if(ier)
-	return fail;
+      ier = FINUFFT3D1(plan->nj, x, y, z, cStart, plan->fftSign, plan->tol, plan->ms,
+                       plan->mt, plan->mu, fStart, popts);
+      t   = timer.elapsedsec();
+      if (ier)
+        return fail;
       else
-	return t;
-      
+        return t;
+
     case 2:
       timer.restart();
-      ier = FINUFFT3D2(plan->nj, x,y,z, cStart, plan->fftSign, plan->tol,
-                       plan->ms, plan->mt, plan->mu, fStart, popts);
-      t = timer.elapsedsec();
-      if(ier)
-	return fail;
+      ier = FINUFFT3D2(plan->nj, x, y, z, cStart, plan->fftSign, plan->tol, plan->ms,
+                       plan->mt, plan->mu, fStart, popts);
+      t   = timer.elapsedsec();
+      if (ier)
+        return fail;
       else
-	return t;
-      
+        return t;
+
     case 3:
       timer.restart();
-      ier = FINUFFT3D3(plan->nj, x,y,z, cStart, plan->fftSign, plan->tol,
-                       plan->nk, plan->S, plan->T, plan->U, fStart, popts);
-      t = timer.elapsedsec();
-      if(ier)
-	return fail;
+      ier = FINUFFT3D3(plan->nj, x, y, z, cStart, plan->fftSign, plan->tol, plan->nk,
+                       plan->S, plan->T, plan->U, fStart, popts);
+      t   = timer.elapsedsec();
+      if (ier)
+        return fail;
       else
-	return t;
+        return t;
 
-    default:                   // invalid type
+    default: // invalid type
       return fail;
     }
 
-  default:                     // invalid dimension
+  default: // invalid dimension
     return fail;
   }
 }
 
-double many_simple_calls(CPX *c,CPX *F, FLT* x, FLT* y, FLT* z, FINUFFT_PLAN plan)
+double many_simple_calls(CPX *c, CPX *F, FLT *x, FLT *y, FLT *z, FINUFFT_PLAN plan)
 /* A unified interface to all of the simple interfaces, with a loop over
    many such transforms. Returns total time reported by the transforms.
    (Used to call pre-v1.2 single implementations in finufft, via runOldFinufft.
    The repo no longer contains those implementations, which used to be in a
    subdirectory.)
 */
-{  
-    CPX *cStart;
-    CPX *fStart;
-
-    double time = 0;
-    double temp = 0;;
-    
-    for(int k = 0; k < plan->ntrans; k++){
-      cStart = c + plan->nj*k;
-      fStart = F + plan->ms*plan->mt*plan->mu*k;
-      
-      //printf("k=%d, debug=%d.................\n",k, plan->opts.debug);      
-      if(k != 0) {                     // prevent massive debug output
-	plan->opts.debug = 0;
-	plan->opts.spread_debug = 0;
-      }
-        
-      temp = finufftFunnel(cStart,fStart, x, y,z,plan);
-      if (isnan(temp)) {
-	fprintf(stderr,"[%s] Funnel call to finufft failed!\n",__func__); 
-        return NAN;
-      }
-      else
-	time += temp;
+{
+  CPX *cStart;
+  CPX *fStart;
+
+  double time = 0;
+  double temp = 0;
+  ;
+
+  for (int k = 0; k < plan->ntrans; k++) {
+    cStart = c + plan->nj * k;
+    fStart = F + plan->ms * plan->mt * plan->mu * k;
+
+    // printf("k=%d, debug=%d.................\n",k, plan->opts.debug);
+    if (k != 0) { // prevent massive debug output
+      plan->opts.debug        = 0;
+      plan->opts.spread_debug = 0;
     }
-    return time;
+
+    temp = finufftFunnel(cStart, fStart, x, y, z, plan);
+    if (isnan(temp)) {
+      fprintf(stderr, "[%s] Funnel call to finufft failed!\n", __func__);
+      return NAN;
+    } else
+      time += temp;
+  }
+  return time;
 }
diff --git a/perftest/manysmallprobs.cpp b/perftest/manysmallprobs.cpp
index c6776cf0e..0f2c9d0bb 100644
--- a/perftest/manysmallprobs.cpp
+++ b/perftest/manysmallprobs.cpp
@@ -10,14 +10,14 @@ using namespace finufft::utils;
 #include <stdlib.h>
 using namespace std;
 
-int main(int argc, char* argv[])
+int main(int argc, char *argv[])
 /* What is small-problem cost of FINUFFT library from C++, using plain
    arrays of C++ complex numbers?  Barnett 10/31/17.
    for Xi Chen question. Updated to also demo guru interface and compare speed.
    6/7/22 made deterministic changes so check answer matches both ways.
 
-   g++ -fopenmp manysmallprobs.cpp ../lib-static/libfinufft.a -o manysmallprobs  -lfftw3 -lfftw3_omp -lm
-   # multithreaded is much slower, due to overhead of starting threads?...
+   g++ -fopenmp manysmallprobs.cpp ../lib-static/libfinufft.a -o manysmallprobs  -lfftw3 -lfftw3_omp -lm
+   # multithreaded is much slower, due to overhead of starting threads?...
    export OMP_NUM_THREADS=1
    time ./manysmallprobs
 
@@ -26,54 +26,64 @@ int main(int argc, char* argv[])
 
    But why is multi-thread so much slower? (thread start-up time?)
 */
-{  
-  int M = 2e2;            // number of nonuniform points
-  int N = 2e2;            // number of modes
-  int reps = 2e4;         // how many repetitions
-  double acc = 1e-6;      // desired accuracy
-  
-  complex<double> I = complex<double>(0.0,1.0);  // the imaginary unit
+{
+  int M      = 2e2;                              // number of nonuniform points
+  int N      = 2e2;                              // number of modes
+  int reps   = 2e4;                              // how many repetitions
+  double acc = 1e-6;                             // desired accuracy
+
+  complex<double> I = complex<double>(0.0, 1.0); // the imaginary unit
   int ier;
-  
+
   // generate some random nonuniform points (x) and complex strengths (c):
-  double *x = (double *)malloc(sizeof(double)*M);
-  complex<double>* c = (complex<double>*)malloc(sizeof(complex<double>)*M);
-  for (int j=0; j<M; ++j) {
-    x[j] = M_PI*(2*((double)rand()/RAND_MAX)-1);  // uniform random in [-pi,pi]
-    c[j] = 2*((double)rand()/RAND_MAX)-1 + I*(2*((double)rand()/RAND_MAX)-1);
+  double *x          = (double *)malloc(sizeof(double) * M);
+  complex<double> *c = (complex<double> *)malloc(sizeof(complex<double>) * M);
+  for (int j = 0; j < M; ++j) {
+    x[j] = M_PI * (2 * ((double)rand() / RAND_MAX) - 1); // uniform random in [-pi,pi]
+    c[j] =
+        2 * ((double)rand() / RAND_MAX) - 1 + I * (2 * ((double)rand() / RAND_MAX) - 1);
   }
   // allocate output array for the Fourier modes:
-  complex<double>* F = (complex<double>*)malloc(sizeof(complex<double>)*N);
+  complex<double> *F = (complex<double> *)malloc(sizeof(complex<double>) * N);
 
   printf("repeatedly calling the simple interface: --------------------- \n");
-  finufft::utils::CNTime timer; timer.start();
-  for (int r=0;r<reps;++r) {    // call the NUFFT (with iflag=+1):
-    //printf("rep %d\n",r);
-    x[0] = M_PI*(-1.0 + 2*(double)r/(double)reps);   // one source jiggles around
-    c[0] = (1.0 + I) * (double)r/(double)reps;       // one coeff also jiggles
-    ier = finufft1d1(M,x,c,+1,acc,N,F,NULL);
+  finufft::utils::CNTime timer;
+  timer.start();
+  for (int r = 0; r < reps; ++r) { // call the NUFFT (with iflag=+1):
+    // printf("rep %d\n",r);
+    x[0] = M_PI * (-1.0 + 2 * (double)r / (double)reps); // one source jiggles around
+    c[0] = (1.0 + I) * (double)r / (double)reps;         // one coeff also jiggles
+    ier  = finufft1d1(M, x, c, +1, acc, N, F, NULL);
   }
   // (note this can't use the many-vectors interface since the NU change)
-  complex<double> y=F[0];    // actually use the data so not optimized away
-  printf("%d reps of 1d1 done in %.3g s,\t%.3g NU pts/s\t(last ier=%d)\nF[0]=%.6g + %.6gi\n",reps,timer.elapsedsec(),reps*M/timer.elapsedsec(),ier,real(y),imag(y));
+  complex<double> y = F[0]; // actually use the data so not optimized away
+  printf(
+      "%d reps of 1d1 done in %.3g s,\t%.3g NU pts/s\t(last ier=%d)\nF[0]=%.6g + %.6gi\n",
+      reps, timer.elapsedsec(), reps * M / timer.elapsedsec(), ier, real(y), imag(y));
 
   printf("repeatedly executing via the guru interface: -------------------\n");
   timer.restart();
-  finufft_plan plan; finufft_opts opts; finufft_default_opts(&opts);
-  opts.debug = 0;
-  int64_t Ns[]={N,1,1};
-  int ntransf = 1;    // since we do one at a time (neq reps)
-  finufft_makeplan(1,1,Ns,+1,ntransf,acc,&plan,&opts);
-  for (int r=0;r<reps;++r) {    // set the pts and execute
-    x[0] = M_PI*(-1.0 + 2*(double)r/(double)reps);   // one source jiggles around
+  finufft_plan plan;
+  finufft_opts opts;
+  finufft_default_opts(&opts);
+  opts.debug   = 0;
+  int64_t Ns[] = {N, 1, 1};
+  int ntransf  = 1;                // since we do one at a time (neq reps)
+  finufft_makeplan(1, 1, Ns, +1, ntransf, acc, &plan, &opts);
+  for (int r = 0; r < reps; ++r) { // set the pts and execute
+    x[0] = M_PI * (-1.0 + 2 * (double)r / (double)reps); // one source jiggles around
     // (of course if most sources *were* in fact fixed, use ZGEMM for them!)
     finufft_setpts(plan, M, x, NULL, NULL, 0, NULL, NULL, NULL);
-    c[0] = (1.0 + I) * (double)r/(double)reps;       // one coeff also jiggles
-    ier = finufft_execute(plan, c, F);
+    c[0] = (1.0 + I) * (double)r / (double)reps; // one coeff also jiggles
+    ier  = finufft_execute(plan, c, F);
   }
   finufft_destroy(plan);
   y = F[0];
-  printf("%d reps of 1d1 done in %.3g s,\t%.3g NU pts/s\t(last ier=%d)\nF[0]=%.6g + %.6gi\n",reps,timer.elapsedsec(),reps*M/timer.elapsedsec(),ier,real(y),imag(y));
-  free(x); free(c); free(F);
+  printf(
+      "%d reps of 1d1 done in %.3g s,\t%.3g NU pts/s\t(last ier=%d)\nF[0]=%.6g + %.6gi\n",
+      reps, timer.elapsedsec(), reps * M / timer.elapsedsec(), ier, real(y), imag(y));
+  free(x);
+  free(c);
+  free(F);
   return ier;
 }
diff --git a/perftest/spreadtestnd.cpp b/perftest/spreadtestnd.cpp
index ab345035c..9b560a25e 100644
--- a/perftest/spreadtestnd.cpp
+++ b/perftest/spreadtestnd.cpp
@@ -1,22 +1,28 @@
-#include <finufft/spreadinterp.h>
 #include <finufft/defs.h>
+#include <finufft/spreadinterp.h>
 #include <finufft/utils.h>
 #include <finufft/utils_precindep.h>
 
-#include <vector>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <vector>
 
 using namespace finufft::spreadinterp;
-using namespace finufft::utils;              // for timer
+using namespace finufft::utils; // for timer
 
-void usage()
-{
-  printf("usage: spreadtestnd dims [M N [tol [sort [flags [debug [kerpad [kerevalmeth [upsampfac]]]]]]]]\n\twhere dims=1,2 or 3\n\tM=# nonuniform pts\n\tN=# uniform pts\n\ttol=requested accuracy\n\tsort=0 (don't sort NU pts), 1 (do), or 2 (maybe sort; default)\n\tflags: expert timing flags, 0 is default (see spreadinterp.h)\n\tdebug=0 (less text out), 1 (more), 2 (lots)\n\tkerpad=0 (no pad to mult of 4), 1 (do, for kerevalmeth=0 only)\n\tkerevalmeth=0 (direct), 1 (Horner ppval)\n\tupsampfac>1; 2 or 1.25 for Horner\n\nexample: ./spreadtestnd 1 1e6 1e6 1e-6 2 0 1\n");
+void usage() {
+  printf("usage: spreadtestnd dims [M N [tol [sort [flags [debug [kerpad [kerevalmeth "
+         "[upsampfac]]]]]]]]\n\twhere dims=1,2 or 3\n\tM=# nonuniform pts\n\tN=# uniform "
+         "pts\n\ttol=requested accuracy\n\tsort=0 (don't sort NU pts), 1 (do), or 2 "
+         "(maybe sort; default)\n\tflags: expert timing flags, 0 is default (see "
+         "spreadinterp.h)\n\tdebug=0 (less text out), 1 (more), 2 (lots)\n\tkerpad=0 (no "
+         "pad to mult of 4), 1 (do, for kerevalmeth=0 only)\n\tkerevalmeth=0 (direct), 1 "
+         "(Horner ppval)\n\tupsampfac>1; 2 or 1.25 for Horner\n\nexample: ./spreadtestnd "
+         "1 1e6 1e6 1e-6 2 0 1\n");
 }
 
-int main(int argc, char* argv[])
+int main(int argc, char *argv[])
 /* Test executable for the 1D, 2D, or 3D C++ spreader, both directions.
  * It checks speed, and basic correctness via the grid sum of the result.
  * See usage() for usage.  Note it currently tests only pirange=0, which is not
@@ -25,7 +31,8 @@ int main(int argc, char* argv[])
  * Example: spreadtestnd 3 8e6 8e6 1e-6 2 0 1
  *
  * Compilation (also check ../makefile):
- *    g++ spreadtestnd.cpp ../src/spreadinterp.o ../src/utils.o -o spreadtestnd -fPIC -Ofast -funroll-loops -fopenmp
+ *    g++ spreadtestnd.cpp ../src/spreadinterp.o ../src/utils.o -o spreadtestnd -fPIC \
+ *        -Ofast -funroll-loops -fopenmp
  *
  * Magland; expanded by Barnett 1/14/17. Better cmd line args 3/13/17
  * indep setting N 3/27/17. parallel rand() & sort flag 3/28/17
@@ -34,192 +41,258 @@ int main(int argc, char* argv[])
  * Barbone, removed pirange 05/09/24.
  */
 {
-  int d = 3;            // Cmd line args & their defaults:  default #dims
-  double w, tol = 1e-6; // default (eg 1e-6 has nspread=7)
-  BIGINT M = 1e6;       // default # NU pts
-  BIGINT roughNg = 1e6; // default # U pts
-  int sort = 2;         // spread_sort
-  int flags = 0;        // default
-  int debug = 0;        // default
-  int kerpad = 0;       // default
-  int kerevalmeth = 1;  // default: Horner
-  FLT upsampfac = 2.0;  // standard
-  
-  if (argc<2 || argc==3 || argc>11) {
-    usage(); return (argc>1);
-  }
-  sscanf(argv[1],"%d",&d);
-  if (d<1 || d>3) {
-    printf("d must be 1, 2 or 3!\n"); usage(); return 1;
-  }
-  if (argc>2) {
-    sscanf(argv[2],"%lf",&w); M = (BIGINT)w;       // to read "1e6" right!
-    if (M<1) {
-      printf("M (# NU pts) must be positive!\n"); usage(); return 1;
+  int d = 3;             // Cmd line args & their defaults:  default #dims
+  double w, tol = 1e-6;  // default (eg 1e-6 has nspread=7)
+  BIGINT M        = 1e6; // default # NU pts
+  BIGINT roughNg  = 1e6; // default # U pts
+  int sort        = 2;   // spread_sort
+  int flags       = 0;   // default
+  int debug       = 0;   // default
+  int kerpad      = 0;   // default
+  int kerevalmeth = 1;   // default: Horner
+  FLT upsampfac   = 2.0; // standard
+
+  if (argc < 2 || argc == 3 || argc > 11) {
+    usage();
+    return (argc > 1);
+  }
+  sscanf(argv[1], "%d", &d);
+  if (d < 1 || d > 3) {
+    printf("d must be 1, 2 or 3!\n");
+    usage();
+    return 1;
+  }
+  if (argc > 2) {
+    sscanf(argv[2], "%lf", &w);
+    M = (BIGINT)w; // to read "1e6" right!
+    if (M < 1) {
+      printf("M (# NU pts) must be positive!\n");
+      usage();
+      return 1;
     }
-    sscanf(argv[3],"%lf",&w); roughNg = (BIGINT)w;
-    if (roughNg<1) {
-      printf("N (# U pts) must be positive!\n"); usage(); return 1;
+    sscanf(argv[3], "%lf", &w);
+    roughNg = (BIGINT)w;
+    if (roughNg < 1) {
+      printf("N (# U pts) must be positive!\n");
+      usage();
+      return 1;
     }
   }
-  if (argc>4) sscanf(argv[4],"%lf",&tol);
-  if (argc>5) {
-    sscanf(argv[5],"%d",&sort);
-    if ((sort!=0) && (sort!=1) && (sort!=2)) {
-      printf("sort must be 0, 1 or 2!\n"); usage(); return 1;
+  if (argc > 4) sscanf(argv[4], "%lf", &tol);
+  if (argc > 5) {
+    sscanf(argv[5], "%d", &sort);
+    if ((sort != 0) && (sort != 1) && (sort != 2)) {
+      printf("sort must be 0, 1 or 2!\n");
+      usage();
+      return 1;
     }
   }
-  if (argc>6)
-    sscanf(argv[6],"%d",&flags);
-  if (argc>7) {
-    sscanf(argv[7],"%d",&debug);
-    if ((debug<0) || (debug>2)) {
-      printf("debug must be 0, 1 or 2!\n"); usage(); return 1;
+  if (argc > 6) sscanf(argv[6], "%d", &flags);
+  if (argc > 7) {
+    sscanf(argv[7], "%d", &debug);
+    if ((debug < 0) || (debug > 2)) {
+      printf("debug must be 0, 1 or 2!\n");
+      usage();
+      return 1;
     }
   }
-  if (argc>8) {
-    sscanf(argv[8],"%d",&kerpad);
-    if ((kerpad<0) || (kerpad>1)) {
-      printf("kerpad must be 0 or 1!\n"); usage(); return 1;
+  if (argc > 8) {
+    sscanf(argv[8], "%d", &kerpad);
+    if ((kerpad < 0) || (kerpad > 1)) {
+      printf("kerpad must be 0 or 1!\n");
+      usage();
+      return 1;
     }
   }
-  if (argc>9) {
-    sscanf(argv[9],"%d",&kerevalmeth);
-    if ((kerevalmeth<0) || (kerevalmeth>1)) {
-      printf("kerevalmeth must be 0 or 1!\n"); usage(); return 1;
+  if (argc > 9) {
+    sscanf(argv[9], "%d", &kerevalmeth);
+    if ((kerevalmeth < 0) || (kerevalmeth > 1)) {
+      printf("kerevalmeth must be 0 or 1!\n");
+      usage();
+      return 1;
     }
   }
-  if (argc>10) {
-    sscanf(argv[10],"%lf",&w); upsampfac = (FLT)w;
-    if (upsampfac<=1.0) {
-      printf("upsampfac must be >1.0!\n"); usage(); return 1;
+  if (argc > 10) {
+    sscanf(argv[10], "%lf", &w);
+    upsampfac = (FLT)w;
+    if (upsampfac <= 1.0) {
+      printf("upsampfac must be >1.0!\n");
+      usage();
+      return 1;
     }
   }
 
-  int dodir1 = true;                        // control if dir=1 tested at all
-  BIGINT N = (BIGINT)round(pow(roughNg,1.0/d));     // Fourier grid size per dim
-  BIGINT Ng = (BIGINT)pow(N,d);                     // actual total grid points
-  BIGINT N2 = (d>=2) ? N : 1, N3 = (d==3) ? N : 1;  // the y and z grid sizes
-  std::vector<FLT> kx(M),ky(1),kz(1),d_nonuniform(2*M);    // NU, Re & Im
-  if (d>1) ky.resize(M);                           // only alloc needed coords
-  if (d>2) kz.resize(M);
-  std::vector<FLT> d_uniform(2*Ng);                        // Re and Im
+  int dodir1 = true;                                   // control if dir=1 tested at all
+  BIGINT N   = (BIGINT)round(pow(roughNg, 1.0 / d));   // Fourier grid size per dim
+  BIGINT Ng  = (BIGINT)pow(N, d);                      // actual total grid points
+  BIGINT N2 = (d >= 2) ? N : 1, N3 = (d == 3) ? N : 1; // the y and z grid sizes
+  std::vector<FLT> kx(M), ky(1), kz(1), d_nonuniform(2 * M); // NU, Re & Im
+  if (d > 1) ky.resize(M);                                   // only alloc needed coords
+  if (d > 2) kz.resize(M);
+  std::vector<FLT> d_uniform(2 * Ng);                        // Re and Im
 
   finufft_spread_opts opts;
-  int ier_set = setup_spreader(opts,(FLT)tol,upsampfac,kerevalmeth,debug,1,d);
-  if (ier_set>1) {       // exit gracefully if can't set up.
-    printf("error when setting up spreader (ier_set=%d)!\n",ier_set);
+  int ier_set = setup_spreader(opts, (FLT)tol, upsampfac, kerevalmeth, debug, 1, d);
+  if (ier_set > 1) { // exit gracefully if can't set up.
+    printf("error when setting up spreader (ier_set=%d)!\n", ier_set);
     return ier_set;
   }
-  opts.debug = debug;   // print more diagnostics?
-  opts.sort = sort;
-  opts.flags = flags;
-  opts.kerpad = kerpad;
-  opts.upsampfac = upsampfac;
-  opts.nthreads = 0;  // max # threads used, or 0 to use what's avail
+  opts.debug        = debug; // print more diagnostics?
+  opts.sort         = sort;
+  opts.flags        = flags;
+  opts.kerpad       = kerpad;
+  opts.upsampfac    = upsampfac;
+  opts.nthreads     = 0; // max # threads used, or 0 to use what's avail
   opts.sort_threads = 0;
-  //opts.max_subproblem_size = 1e5;
+  // opts.max_subproblem_size = 1e5;
   FLT maxerr, ansmod;
-  
+
   // spread a single source, only for reference accuracy check...
-  opts.spread_direction=1;
-  d_nonuniform[0] = 1.0; d_nonuniform[1] = 0.0;   // unit strength
-  kx[0] = ky[0] = kz[0] = 0.0;                    // at center (probably doesn't matter); domain is [-pi,pi)^d
-  int ier = spreadinterp(N,N2,N3,d_uniform.data(),1,kx.data(),ky.data(),kz.data(),d_nonuniform.data(),opts);          // vector::data officially C++11 but works
-  if (ier!=0) {
-    printf("error when spreading M=1 pt for ref acc check (ier=%d)!\n",ier);
+  opts.spread_direction = 1;
+  d_nonuniform[0]       = 1.0;
+  d_nonuniform[1]       = 0.0; // unit strength
+  kx[0] = ky[0] = kz[0] = 0.0; // at center (probably doesn't matter); domain is
+                               // [-pi,pi)^d
+  int ier = spreadinterp(N,
+                         N2,
+                         N3,
+                         d_uniform.data(),
+                         1,
+                         kx.data(),
+                         ky.data(),
+                         kz.data(),
+                         d_nonuniform.data(),
+                         opts); // vector::data officially C++11 but works
+  if (ier != 0) {
+    printf("error when spreading M=1 pt for ref acc check (ier=%d)!\n", ier);
     return ier;
   }
-  FLT kersumre = 0.0, kersumim = 0.0;  // sum kernel on uniform grid
-  for (BIGINT i=0;i<Ng;++i) {
-    kersumre += d_uniform[2*i]; 
-    kersumim += d_uniform[2*i+1];    // in case the kernel isn't real!
+  FLT kersumre = 0.0, kersumim = 0.0; // sum kernel on uniform grid
+  for (BIGINT i = 0; i < Ng; ++i) {
+    kersumre += d_uniform[2 * i];
+    kersumim += d_uniform[2 * i + 1]; // in case the kernel isn't real!
   }
 
   // now do the large-scale test w/ random sources..
   printf("making random data...\n");
-  FLT strre = 0.0, strim = 0.0;          // also sum the strengths
+  FLT strre = 0.0, strim = 0.0; // also sum the strengths
 #pragma omp parallel
   {
-    unsigned int se=MY_OMP_GET_THREAD_NUM();  // needed for parallel random #s
-#pragma omp for schedule(dynamic,1000000) reduction(+:strre,strim)
-    for (BIGINT i=0; i<M; ++i) {
-      kx[i]=randm11r(&se)*3*M_PI;
-      //kx[i]=2.0*kx[i] - 50.0;      //// to test folding within +-1 period
-      if (d>1) ky[i]=randm11r(&se)*3*M_PI;      // only fill needed coords
-      if (d>2) kz[i]=randm11r(&se)*3*M_PI;
-      d_nonuniform[i*2]=randm11r(&se);
-      d_nonuniform[i*2+1]=randm11r(&se);
-      strre += d_nonuniform[2*i]; 
-      strim += d_nonuniform[2*i+1];
+    unsigned int se = MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s
+#pragma omp for schedule(dynamic, 1000000) reduction(+ : strre, strim)
+    for (BIGINT i = 0; i < M; ++i) {
+      kx[i] = randm11r(&se) * 3 * M_PI;
+      // kx[i]=2.0*kx[i] - 50.0;      //// to test folding within +-1 period
+      if (d > 1) ky[i] = randm11r(&se) * 3 * M_PI; // only fill needed coords
+      if (d > 2) kz[i] = randm11r(&se) * 3 * M_PI;
+      d_nonuniform[i * 2]     = randm11r(&se);
+      d_nonuniform[i * 2 + 1] = randm11r(&se);
+      strre += d_nonuniform[2 * i];
+      strim += d_nonuniform[2 * i + 1];
     }
   }
   CNTime timer;
   double t;
-  if (dodir1) {   // test direction 1 (NU -> U spreading) ......................
-    printf("spreadinterp %dD, %.3g U pts, dir=%d, tol=%.3g: nspread=%d\n",d,(double)Ng,opts.spread_direction,tol,opts.nspread);
+  if (dodir1) { // test direction 1 (NU -> U spreading) ......................
+    printf("spreadinterp %dD, %.3g U pts, dir=%d, tol=%.3g: nspread=%d\n",
+           d,
+           (double)Ng,
+           opts.spread_direction,
+           tol,
+           opts.nspread);
     timer.start();
-    ier = spreadinterp(N,N2,N3,d_uniform.data(),M,kx.data(),ky.data(),kz.data(),d_nonuniform.data(),opts);
-    t=timer.elapsedsec();
-    if (ier!=0) {
-      printf("error (ier=%d)!\n",ier);
+    ier = spreadinterp(N,
+                       N2,
+                       N3,
+                       d_uniform.data(),
+                       M,
+                       kx.data(),
+                       ky.data(),
+                       kz.data(),
+                       d_nonuniform.data(),
+                       opts);
+    t   = timer.elapsedsec();
+    if (ier != 0) {
+      printf("error (ier=%d)!\n", ier);
       return ier;
     } else
-      printf("    %.3g NU pts in %.3g s \t%.3g pts/s \t%.3g spread pts/s\n",(double)M,t,M/t,pow(opts.nspread,d)*M/t);
-  
-    FLT sumre = 0.0, sumim = 0.0;   // check spreading accuracy, wrapping
-#pragma omp parallel for reduction(+:sumre,sumim)
-    for (BIGINT i=0;i<Ng;++i) {
-      sumre += d_uniform[2*i]; 
-      sumim += d_uniform[2*i+1];
+      printf("    %.3g NU pts in %.3g s \t%.3g pts/s \t%.3g spread pts/s\n",
+             (double)M,
+             t,
+             M / t,
+             pow(opts.nspread, d) * M / t);
+
+    FLT sumre = 0.0, sumim = 0.0; // check spreading accuracy, wrapping
+#pragma omp parallel for reduction(+ : sumre, sumim)
+    for (BIGINT i = 0; i < Ng; ++i) {
+      sumre += d_uniform[2 * i];
+      sumim += d_uniform[2 * i + 1];
     }
-    FLT pre = kersumre*strre - kersumim*strim;   // pred ans, complex mult
-    FLT pim = kersumim*strre + kersumre*strim;
-    FLT maxerr = std::max(fabs(sumre-pre), fabs(sumim-pim));
-    FLT ansmod = sqrt(sumre*sumre+sumim*sumim);
-    printf("    rel err in total over grid:      %.3g\n",maxerr/ansmod);
+    FLT pre    = kersumre * strre - kersumim * strim; // pred ans, complex mult
+    FLT pim    = kersumim * strre + kersumre * strim;
+    FLT maxerr = std::max(fabs(sumre - pre), fabs(sumim - pim));
+    FLT ansmod = sqrt(sumre * sumre + sumim * sumim);
+    printf("    rel err in total over grid:      %.3g\n", maxerr / ansmod);
     // note this is weaker than below dir=2 test, but is good indicator that
     // periodic wrapping is correct
   }
 
   // test direction 2 (U -> NU interpolation) ..............................
   printf("making more random NU pts...\n");
-  for (BIGINT i=0;i<Ng;++i) {     // unit grid data
-    d_uniform[2*i] = 1.0;
-    d_uniform[2*i+1] = 0.0;
+  for (BIGINT i = 0; i < Ng; ++i) { // unit grid data
+    d_uniform[2 * i]     = 1.0;
+    d_uniform[2 * i + 1] = 0.0;
   }
 #pragma omp parallel
   {
-    unsigned int se=MY_OMP_GET_THREAD_NUM();  // needed for parallel random #s
-#pragma omp for schedule(dynamic,1000000)
-      for (BIGINT i=0; i<M; ++i) {       // random target pts
-        //kx[i]=10+.9*rand01r(&s)*N;   // or if want to keep ns away from edges
-        kx[i]=randm11r(&se)*3*M_PI;
-        if (d>1) ky[i]=randm11r(&se)*3*M_PI;
-        if (d>2) kz[i]=randm11r(&se)*3*M_PI;
-      }
+    unsigned int se = MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s
+#pragma omp for schedule(dynamic, 1000000)
+    for (BIGINT i = 0; i < M; ++i) {           // random target pts
+      // kx[i]=10+.9*rand01r(&s)*N;   // or if want to keep ns away from edges
+      kx[i] = randm11r(&se) * 3 * M_PI;
+      if (d > 1) ky[i] = randm11r(&se) * 3 * M_PI;
+      if (d > 2) kz[i] = randm11r(&se) * 3 * M_PI;
+    }
   }
 
-  opts.spread_direction=2;
-  printf("spreadinterp %dD, %.3g U pts, dir=%d, tol=%.3g: nspread=%d\n",d,(double)Ng,opts.spread_direction,tol,opts.nspread);
+  opts.spread_direction = 2;
+  printf("spreadinterp %dD, %.3g U pts, dir=%d, tol=%.3g: nspread=%d\n",
+         d,
+         (double)Ng,
+         opts.spread_direction,
+         tol,
+         opts.nspread);
   timer.restart();
-  ier = spreadinterp(N,N2,N3,d_uniform.data(),M,kx.data(),ky.data(),kz.data(),d_nonuniform.data(),opts);
-  t=timer.elapsedsec();
-  if (ier!=0) {
-    printf("error (ier=%d)!\n",ier);
+  ier = spreadinterp(N,
+                     N2,
+                     N3,
+                     d_uniform.data(),
+                     M,
+                     kx.data(),
+                     ky.data(),
+                     kz.data(),
+                     d_nonuniform.data(),
+                     opts);
+  t   = timer.elapsedsec();
+  if (ier != 0) {
+    printf("error (ier=%d)!\n", ier);
     return 1;
   } else
-    printf("    %.3g NU pts in %.3g s \t%.3g pts/s \t%.3g spread pts/s\n",(double)M,t,M/t,pow(opts.nspread,d)*M/t);
+    printf("    %.3g NU pts in %.3g s \t%.3g pts/s \t%.3g spread pts/s\n",
+           (double)M,
+           t,
+           M / t,
+           pow(opts.nspread, d) * M / t);
 
   // math test is worst-case error from pred value (kersum) on interp pts:
   maxerr = 0.0;
-  for (BIGINT i=0;i<M;++i) {
-    FLT err = std::max(fabs(d_nonuniform[2*i]-kersumre),
-		       fabs(d_nonuniform[2*i+1]-kersumim));
-    if (err>maxerr) maxerr=err;
+  for (BIGINT i = 0; i < M; ++i) {
+    FLT err = std::max(fabs(d_nonuniform[2 * i] - kersumre),
+                       fabs(d_nonuniform[2 * i + 1] - kersumim));
+    if (err > maxerr) maxerr = err;
   }
-  ansmod = sqrt(kersumre*kersumre+kersumim*kersumim);
-  printf("    max rel err in values at NU pts: %.3g\n",maxerr/ansmod);
+  ansmod = sqrt(kersumre * kersumre + kersumim * kersumim);
+  printf("    max rel err in values at NU pts: %.3g\n", maxerr / ansmod);
   // this is stronger test than for dir=1, since it tests sum of kernel for
   // each NU pt. However, it cannot detect reading
   // from wrong grid pts (they are all unity)
diff --git a/src/cuda/1d/cufinufft1d.cu b/src/cuda/1d/cufinufft1d.cu
index 246a064f6..6bff9cb6e 100644
--- a/src/cuda/1d/cufinufft1d.cu
+++ b/src/cuda/1d/cufinufft1d.cu
@@ -16,8 +16,9 @@
 using namespace cufinufft::deconvolve;
 using namespace cufinufft::spreadinterp;
 
-template <typename T>
-int cufinufft1d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_plan_t<T> *d_plan)
+template<typename T>
+int cufinufft1d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
+                      cufinufft_plan_t<T> *d_plan)
 /*
     1D Type-1 NUFFT
 
@@ -31,43 +32,44 @@ int cufinufft1d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_pla
     Melody Shih 11/21/21
 */
 {
-    assert(d_plan->spopts.spread_direction == 1);
-    auto &stream = d_plan->stream;
-
-    int ier;
-    cuda_complex<T> *d_fkstart;
-    cuda_complex<T> *d_cstart;
-    for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) {
-        int blksize = std::min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize);
-        d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M;
-        d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms;
-        d_plan->c = d_cstart;
-        d_plan->fk = d_fkstart;
-
-        // this is needed
-        if ((ier = checkCudaErrors(
-                 cudaMemsetAsync(d_plan->fw, 0, d_plan->maxbatchsize * d_plan->nf1 * sizeof(cuda_complex<T>), stream))))
-            return ier;
-
-        // Step 1: Spread
-        if ((ier = cuspread1d<T>(d_plan, blksize)))
-            return ier;
-
-        // Step 2: FFT
-        cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
-        if (cufft_status != CUFFT_SUCCESS)
-            return FINUFFT_ERR_CUDA_FAILURE;
-
-        // Step 3: deconvolve and shuffle
-        if ((ier = cudeconvolve1d<T>(d_plan, blksize)))
-            return ier;
-    }
-
-    return 0;
+  assert(d_plan->spopts.spread_direction == 1);
+  auto &stream = d_plan->stream;
+
+  int ier;
+  cuda_complex<T> *d_fkstart;
+  cuda_complex<T> *d_cstart;
+  for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) {
+    int blksize =
+        std::min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize);
+    d_cstart   = d_c + i * d_plan->maxbatchsize * d_plan->M;
+    d_fkstart  = d_fk + i * d_plan->maxbatchsize * d_plan->ms;
+    d_plan->c  = d_cstart;
+    d_plan->fk = d_fkstart;
+
+    // this is needed
+    if ((ier = checkCudaErrors(cudaMemsetAsync(
+             d_plan->fw, 0, d_plan->maxbatchsize * d_plan->nf1 * sizeof(cuda_complex<T>),
+             stream))))
+      return ier;
+
+    // Step 1: Spread
+    if ((ier = cuspread1d<T>(d_plan, blksize))) return ier;
+
+    // Step 2: FFT
+    cufftResult cufft_status =
+        cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
+    if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
+
+    // Step 3: deconvolve and shuffle
+    if ((ier = cudeconvolve1d<T>(d_plan, blksize))) return ier;
+  }
+
+  return 0;
 }
 
-template <typename T>
-int cufinufft1d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_plan_t<T> *d_plan)
+template<typename T>
+int cufinufft1d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
+                      cufinufft_plan_t<T> *d_plan)
 /*
     1D Type-2 NUFFT
 
@@ -81,41 +83,42 @@ int cufinufft1d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_pla
     Melody Shih 11/21/21
 */
 {
-    assert(d_plan->spopts.spread_direction == 2);
-
-    int ier;
-    cuda_complex<T> *d_fkstart;
-    cuda_complex<T> *d_cstart;
-    for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) {
-        int blksize = std::min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize);
-        d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M;
-        d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms;
-
-        d_plan->c = d_cstart;
-        d_plan->fk = d_fkstart;
-
-        // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
-        if ((ier = cudeconvolve1d<T>(d_plan, blksize)))
-            return ier;
-
-        // Step 2: FFT
-        cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
-        if (cufft_status != CUFFT_SUCCESS)
-            return FINUFFT_ERR_CUDA_FAILURE;
-
-        // Step 3: deconvolve and shuffle
-        if ((ier = cuinterp1d<T>(d_plan, blksize)))
-            return ier;
-    }
-
-    return 0;
+  assert(d_plan->spopts.spread_direction == 2);
+
+  int ier;
+  cuda_complex<T> *d_fkstart;
+  cuda_complex<T> *d_cstart;
+  for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) {
+    int blksize =
+        std::min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize);
+    d_cstart  = d_c + i * d_plan->maxbatchsize * d_plan->M;
+    d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms;
+
+    d_plan->c  = d_cstart;
+    d_plan->fk = d_fkstart;
+
+    // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
+    if ((ier = cudeconvolve1d<T>(d_plan, blksize))) return ier;
+
+    // Step 2: FFT
+    cufftResult cufft_status =
+        cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
+    if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
+
+    // Step 3: interpolate (via cuinterp1d)
+    if ((ier = cuinterp1d<T>(d_plan, blksize))) return ier;
+  }
+
+  return 0;
 }
 
 template int cufinufft1d1_exec<float>(cuda_complex<float> *d_c, cuda_complex<float> *d_fk,
                                       cufinufft_plan_t<float> *d_plan);
-template int cufinufft1d1_exec<double>(cuda_complex<double> *d_c, cuda_complex<double> *d_fk,
+template int cufinufft1d1_exec<double>(cuda_complex<double> *d_c,
+                                       cuda_complex<double> *d_fk,
                                        cufinufft_plan_t<double> *d_plan);
 template int cufinufft1d2_exec<float>(cuda_complex<float> *d_c, cuda_complex<float> *d_fk,
                                       cufinufft_plan_t<float> *d_plan);
-template int cufinufft1d2_exec<double>(cuda_complex<double> *d_c, cuda_complex<double> *d_fk,
+template int cufinufft1d2_exec<double>(cuda_complex<double> *d_c,
+                                       cuda_complex<double> *d_fk,
                                        cufinufft_plan_t<double> *d_plan);
diff --git a/src/cuda/1d/interp1d_wrapper.cu b/src/cuda/1d/interp1d_wrapper.cu
index 0940f10de..cd3637c8b 100644
--- a/src/cuda/1d/interp1d_wrapper.cu
+++ b/src/cuda/1d/interp1d_wrapper.cu
@@ -14,7 +14,7 @@ using namespace cufinufft::memtransfer;
 namespace cufinufft {
 namespace spreadinterp {
 
-template <typename T>
+template<typename T>
 int cuinterp1d(cufinufft_plan_t<T> *d_plan, int blksize)
 /*
     A wrapper for different interpolation methods.
@@ -26,58 +26,60 @@ int cuinterp1d(cufinufft_plan_t<T> *d_plan, int blksize)
     Melody Shih 11/21/21
 */
 {
-    int nf1 = d_plan->nf1;
-    int M = d_plan->M;
-
-    int ier;
-    switch (d_plan->opts.gpu_method) {
-    case 1: {
-        ier = cuinterp1d_nuptsdriven<T>(nf1, M, d_plan, blksize);
-    } break;
-    default:
-        std::cerr << "[cuinterp1d] error: incorrect method, should be 1" << std::endl;
-        ier = FINUFFT_ERR_METHOD_NOTVALID;
-    }
-
-    return ier;
+  int nf1 = d_plan->nf1;
+  int M   = d_plan->M;
+
+  int ier;
+  switch (d_plan->opts.gpu_method) {
+  case 1: {
+    ier = cuinterp1d_nuptsdriven<T>(nf1, M, d_plan, blksize);
+  } break;
+  default:
+    std::cerr << "[cuinterp1d] error: incorrect method, should be 1" << std::endl;
+    ier = FINUFFT_ERR_METHOD_NOTVALID;
+  }
+
+  return ier;
 }
 
-template <typename T>
+template<typename T>
 int cuinterp1d_nuptsdriven(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize) {
-    auto &stream = d_plan->stream;
-    dim3 threadsPerBlock;
-    dim3 blocks;
-
-    int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells
-    T es_c = d_plan->spopts.ES_c;
-    T es_beta = d_plan->spopts.ES_beta;
-    T sigma = d_plan->opts.upsampfac;
-    int *d_idxnupts = d_plan->idxnupts;
-
-    T *d_kx = d_plan->kx;
-    cuda_complex<T> *d_c = d_plan->c;
-    cuda_complex<T> *d_fw = d_plan->fw;
-
-    threadsPerBlock.x = 32;
-    threadsPerBlock.y = 1;
-    blocks.x = (M + threadsPerBlock.x - 1) / threadsPerBlock.x;
-    blocks.y = 1;
-
-    if (d_plan->opts.gpu_kerevalmeth) {
-        for (int t = 0; t < blksize; t++) {
-            interp_1d_nuptsdriven<T, 1><<<blocks, threadsPerBlock, 0, stream>>>(
-                d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, d_idxnupts);
-            RETURN_IF_CUDA_ERROR
-        }
-    } else {
-        for (int t = 0; t < blksize; t++) {
-            interp_1d_nuptsdriven<T, 0><<<blocks, threadsPerBlock, 0, stream>>>(
-                d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, d_idxnupts);
-            RETURN_IF_CUDA_ERROR
-        }
+  auto &stream = d_plan->stream;
+  dim3 threadsPerBlock;
+  dim3 blocks;
+
+  int ns          = d_plan->spopts.nspread; // psi's support in terms of number of cells
+  T es_c          = d_plan->spopts.ES_c;
+  T es_beta       = d_plan->spopts.ES_beta;
+  T sigma         = d_plan->opts.upsampfac;
+  int *d_idxnupts = d_plan->idxnupts;
+
+  T *d_kx               = d_plan->kx;
+  cuda_complex<T> *d_c  = d_plan->c;
+  cuda_complex<T> *d_fw = d_plan->fw;
+
+  threadsPerBlock.x = 32;
+  threadsPerBlock.y = 1;
+  blocks.x          = (M + threadsPerBlock.x - 1) / threadsPerBlock.x;
+  blocks.y          = 1;
+
+  if (d_plan->opts.gpu_kerevalmeth) {
+    for (int t = 0; t < blksize; t++) {
+      interp_1d_nuptsdriven<T, 1><<<blocks, threadsPerBlock, 0, stream>>>(
+          d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma,
+          d_idxnupts);
+      RETURN_IF_CUDA_ERROR
+    }
+  } else {
+    for (int t = 0; t < blksize; t++) {
+      interp_1d_nuptsdriven<T, 0><<<blocks, threadsPerBlock, 0, stream>>>(
+          d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma,
+          d_idxnupts);
+      RETURN_IF_CUDA_ERROR
     }
+  }
 
-    return 0;
+  return 0;
 }
 
 template int cuinterp1d<float>(cufinufft_plan_t<float> *d_plan, int blksize);
diff --git a/src/cuda/1d/spread1d_wrapper.cu b/src/cuda/1d/spread1d_wrapper.cu
index c41ce0919..e72ade469 100644
--- a/src/cuda/1d/spread1d_wrapper.cu
+++ b/src/cuda/1d/spread1d_wrapper.cu
@@ -19,7 +19,7 @@ using namespace cufinufft::memtransfer;
 namespace cufinufft {
 namespace spreadinterp {
 
-template <typename T>
+template<typename T>
 int cuspread1d(cufinufft_plan_t<T> *d_plan, int blksize)
 /*
     A wrapper for different spreading methods.
@@ -31,143 +31,52 @@ int cuspread1d(cufinufft_plan_t<T> *d_plan, int blksize)
     Melody Shih 11/21/21
 */
 {
-    int nf1 = d_plan->nf1;
-    int M = d_plan->M;
-
-    int ier;
-    switch (d_plan->opts.gpu_method) {
-    case 1: {
-        ier = cuspread1d_nuptsdriven<T>(nf1, M, d_plan, blksize);
-    } break;
-    case 2: {
-        ier = cuspread1d_subprob<T>(nf1, M, d_plan, blksize);
-    } break;
-    default:
-        std::cerr << "[cuspread1d] error: incorrect method, should be 1 or 2\n";
-        ier = FINUFFT_ERR_METHOD_NOTVALID;
-    }
-
-    return ier;
+  int nf1 = d_plan->nf1;
+  int M   = d_plan->M;
+
+  int ier;
+  switch (d_plan->opts.gpu_method) {
+  case 1: {
+    ier = cuspread1d_nuptsdriven<T>(nf1, M, d_plan, blksize);
+  } break;
+  case 2: {
+    ier = cuspread1d_subprob<T>(nf1, M, d_plan, blksize);
+  } break;
+  default:
+    std::cerr << "[cuspread1d] error: incorrect method, should be 1 or 2\n";
+    ier = FINUFFT_ERR_METHOD_NOTVALID;
+  }
+
+  return ier;
 }
 
-template <typename T>
+template<typename T>
 int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t<T> *d_plan) {
-    auto &stream = d_plan->stream;
-
-    if (d_plan->opts.gpu_sort) {
-        int bin_size_x = d_plan->opts.gpu_binsizex;
-        if (bin_size_x < 0) {
-            std::cerr << "[cuspread1d_nuptsdriven_prop] error: invalid binsize (binsizex) = (" << bin_size_x << ")\n";
-            return FINUFFT_ERR_BINSIZE_NOTVALID;
-        }
-
-        int numbins = ceil((T)nf1 / bin_size_x);
-
-        T *d_kx = d_plan->kx;
-
-        int *d_binsize = d_plan->binsize;
-        int *d_binstartpts = d_plan->binstartpts;
-        int *d_sortidx = d_plan->sortidx;
-        int *d_idxnupts = d_plan->idxnupts;
-
-            int ier;
-        if ((ier = checkCudaErrors(cudaMemsetAsync(d_binsize, 0, numbins * sizeof(int), stream))))
-            return ier;
-        calc_bin_size_noghost_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(M, nf1, bin_size_x, numbins, d_binsize,
-                                                                             d_kx, d_sortidx);
-        RETURN_IF_CUDA_ERROR
-
-        int n = numbins;
-        thrust::device_ptr<int> d_ptr(d_binsize);
-        thrust::device_ptr<int> d_result(d_binstartpts);
-        thrust::exclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result);
-
-        calc_inverse_of_global_sort_idx_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(
-            M, bin_size_x, numbins, d_binstartpts, d_sortidx, d_kx, d_idxnupts, nf1);
-        RETURN_IF_CUDA_ERROR
-    } else {
-        int *d_idxnupts = d_plan->idxnupts;
-        trivial_global_sort_index_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(M, d_idxnupts);
-        RETURN_IF_CUDA_ERROR
-    }
-
-    return 0;
-}
-
-template <typename T>
-int cuspread1d_nuptsdriven(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize) {
-    auto &stream = d_plan->stream;
-    dim3 threadsPerBlock;
-    dim3 blocks;
-
-    int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells
-    int *d_idxnupts = d_plan->idxnupts;
-    T es_c = d_plan->spopts.ES_c;
-    T es_beta = d_plan->spopts.ES_beta;
-    T sigma = d_plan->spopts.upsampfac;
-
-    T *d_kx = d_plan->kx;
-    cuda_complex<T> *d_c = d_plan->c;
-    cuda_complex<T> *d_fw = d_plan->fw;
-
-    threadsPerBlock.x = 16;
-    threadsPerBlock.y = 1;
-    blocks.x = (M + threadsPerBlock.x - 1) / threadsPerBlock.x;
-    blocks.y = 1;
-
-    if (d_plan->opts.gpu_kerevalmeth) {
-        for (int t = 0; t < blksize; t++) {
-            spread_1d_nuptsdriven<T, 1><<<blocks, threadsPerBlock, 0, stream>>>(
-                d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, d_idxnupts);
-            RETURN_IF_CUDA_ERROR
-        }
-    } else {
-        for (int t = 0; t < blksize; t++) {
-            spread_1d_nuptsdriven<T, 0><<<blocks, threadsPerBlock, 0, stream>>>(
-                d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, d_idxnupts);
-            RETURN_IF_CUDA_ERROR
-        }
-    }
-
-    return 0;
-}
-
-template <typename T>
-int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t<T> *d_plan)
-/*
-    This function determines the properties for spreading that are independent
-    of the strength of the nodes,  only relates to the locations of the nodes,
-    which only needs to be done once.
-*/
-{
-    auto &stream = d_plan->stream;
-    int ier;
+  auto &stream = d_plan->stream;
 
-    int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize;
+  if (d_plan->opts.gpu_sort) {
     int bin_size_x = d_plan->opts.gpu_binsizex;
     if (bin_size_x < 0) {
-        std::cerr << "[cuspread1d_subprob_prop] error: invalid binsize (binsizex) = (" << bin_size_x << ")\n";
-        return FINUFFT_ERR_BINSIZE_NOTVALID;
+      std::cerr << "[cuspread1d_nuptsdriven_prop] error: invalid binsize (binsizex) = ("
+                << bin_size_x << ")\n";
+      return FINUFFT_ERR_BINSIZE_NOTVALID;
     }
 
     int numbins = ceil((T)nf1 / bin_size_x);
 
     T *d_kx = d_plan->kx;
 
-    int *d_binsize = d_plan->binsize;
+    int *d_binsize     = d_plan->binsize;
     int *d_binstartpts = d_plan->binstartpts;
-    int *d_sortidx = d_plan->sortidx;
-    int *d_numsubprob = d_plan->numsubprob;
-    int *d_subprobstartpts = d_plan->subprobstartpts;
-    int *d_idxnupts = d_plan->idxnupts;
-
-    int *d_subprob_to_bin = nullptr;
+    int *d_sortidx     = d_plan->sortidx;
+    int *d_idxnupts    = d_plan->idxnupts;
 
-
-    if ((ier = checkCudaErrors(cudaMemsetAsync(d_binsize, 0, numbins * sizeof(int), stream))))
-        return ier;
-    calc_bin_size_noghost_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(M, nf1, bin_size_x, numbins, d_binsize, d_kx,
-                                                                         d_sortidx);
+    int ier;
+    if ((ier = checkCudaErrors(
+             cudaMemsetAsync(d_binsize, 0, numbins * sizeof(int), stream))))
+      return ier;
+    calc_bin_size_noghost_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(
+        M, nf1, bin_size_x, numbins, d_binsize, d_kx, d_sortidx);
     RETURN_IF_CUDA_ERROR
 
     int n = numbins;
@@ -178,101 +87,207 @@ int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t<T> *d_plan)
     calc_inverse_of_global_sort_idx_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(
         M, bin_size_x, numbins, d_binstartpts, d_sortidx, d_kx, d_idxnupts, nf1);
     RETURN_IF_CUDA_ERROR
-
-    calc_subprob_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(d_binsize, d_numsubprob, maxsubprobsize, numbins);
+  } else {
+    int *d_idxnupts = d_plan->idxnupts;
+    trivial_global_sort_index_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(M,
+                                                                             d_idxnupts);
     RETURN_IF_CUDA_ERROR
+  }
 
-    d_ptr = thrust::device_pointer_cast(d_numsubprob);
-    d_result = thrust::device_pointer_cast(d_subprobstartpts + 1);
-    thrust::inclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result);
-
-    if ((ier = checkCudaErrors(cudaMemsetAsync(d_subprobstartpts, 0, sizeof(int), stream))))
-        return ier;
+  return 0;
+}
 
-    int totalnumsubprob;
-    if ((ier = checkCudaErrors(
-             cudaMemcpyAsync(&totalnumsubprob, &d_subprobstartpts[n], sizeof(int), cudaMemcpyDeviceToHost, stream))))
-        return ier;
-    cudaStreamSynchronize(stream);
-    if ((ier = checkCudaErrors(cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream))))
-        return ier;
-    map_b_into_subprob_1d<<<(numbins + 1024 - 1) / 1024, 1024, 0, stream>>>(d_subprob_to_bin, d_subprobstartpts,
-                                                                            d_numsubprob, numbins);
-    cudaError_t err = cudaGetLastError();
-    if (err != cudaSuccess) {
-        fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err));
-        cudaFree(d_subprob_to_bin);
-        return FINUFFT_ERR_CUDA_FAILURE;
+template<typename T>
+int cuspread1d_nuptsdriven(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize) {
+  auto &stream = d_plan->stream;
+  dim3 threadsPerBlock;
+  dim3 blocks;
+
+  int ns          = d_plan->spopts.nspread; // psi's support in terms of number of cells
+  int *d_idxnupts = d_plan->idxnupts;
+  T es_c          = d_plan->spopts.ES_c;
+  T es_beta       = d_plan->spopts.ES_beta;
+  T sigma         = d_plan->spopts.upsampfac;
+
+  T *d_kx               = d_plan->kx;
+  cuda_complex<T> *d_c  = d_plan->c;
+  cuda_complex<T> *d_fw = d_plan->fw;
+
+  threadsPerBlock.x = 16;
+  threadsPerBlock.y = 1;
+  blocks.x          = (M + threadsPerBlock.x - 1) / threadsPerBlock.x;
+  blocks.y          = 1;
+
+  if (d_plan->opts.gpu_kerevalmeth) {
+    for (int t = 0; t < blksize; t++) {
+      spread_1d_nuptsdriven<T, 1><<<blocks, threadsPerBlock, 0, stream>>>(
+          d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma,
+          d_idxnupts);
+      RETURN_IF_CUDA_ERROR
     }
+  } else {
+    for (int t = 0; t < blksize; t++) {
+      spread_1d_nuptsdriven<T, 0><<<blocks, threadsPerBlock, 0, stream>>>(
+          d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma,
+          d_idxnupts);
+      RETURN_IF_CUDA_ERROR
+    }
+  }
 
-    assert(d_subprob_to_bin != NULL);
-    cudaFreeAsync(d_plan->subprob_to_bin, stream);
-    d_plan->subprob_to_bin = d_subprob_to_bin;
-    d_plan->totalnumsubprob = totalnumsubprob;
-
-    return 0;
+  return 0;
 }
 
-template <typename T>
-int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize) {
-    auto &stream = d_plan->stream;
+template<typename T>
+int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t<T> *d_plan)
+/*
+    This function determines the properties for spreading that are independent
+    of the strength of the nodes,  only relates to the locations of the nodes,
+    which only needs to be done once.
+*/
+{
+  auto &stream = d_plan->stream;
+  int ier;
 
-    int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells
-    T es_c = d_plan->spopts.ES_c;
-    T es_beta = d_plan->spopts.ES_beta;
-    int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize;
+  int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize;
+  int bin_size_x     = d_plan->opts.gpu_binsizex;
+  if (bin_size_x < 0) {
+    std::cerr << "[cuspread1d_subprob_prop] error: invalid binsize (binsizex) = ("
+              << bin_size_x << ")\n";
+    return FINUFFT_ERR_BINSIZE_NOTVALID;
+  }
 
-    // assume that bin_size_x > ns/2;
-    int bin_size_x = d_plan->opts.gpu_binsizex;
-    int numbins = ceil((T)nf1 / bin_size_x);
+  int numbins = ceil((T)nf1 / bin_size_x);
 
-    T *d_kx = d_plan->kx;
-    cuda_complex<T> *d_c = d_plan->c;
-    cuda_complex<T> *d_fw = d_plan->fw;
+  T *d_kx = d_plan->kx;
 
-    int *d_binsize = d_plan->binsize;
-    int *d_binstartpts = d_plan->binstartpts;
-    int *d_numsubprob = d_plan->numsubprob;
-    int *d_subprobstartpts = d_plan->subprobstartpts;
-    int *d_idxnupts = d_plan->idxnupts;
+  int *d_binsize         = d_plan->binsize;
+  int *d_binstartpts     = d_plan->binstartpts;
+  int *d_sortidx         = d_plan->sortidx;
+  int *d_numsubprob      = d_plan->numsubprob;
+  int *d_subprobstartpts = d_plan->subprobstartpts;
+  int *d_idxnupts        = d_plan->idxnupts;
 
-    int totalnumsubprob = d_plan->totalnumsubprob;
-    int *d_subprob_to_bin = d_plan->subprob_to_bin;
+  int *d_subprob_to_bin = nullptr;
 
+  if ((ier =
+           checkCudaErrors(cudaMemsetAsync(d_binsize, 0, numbins * sizeof(int), stream))))
+    return ier;
+  calc_bin_size_noghost_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(
+      M, nf1, bin_size_x, numbins, d_binsize, d_kx, d_sortidx);
+  RETURN_IF_CUDA_ERROR
 
-    T sigma = d_plan->opts.upsampfac;
+  int n = numbins;
+  thrust::device_ptr<int> d_ptr(d_binsize);
+  thrust::device_ptr<int> d_result(d_binstartpts);
+  thrust::exclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result);
 
-    size_t sharedplanorysize = (bin_size_x + 2 * (int)ceil(ns / 2.0)) * sizeof(cuda_complex<T>);
-    if (sharedplanorysize > 49152) {
-        std::cerr << "[cuspread1d_subprob] error: not enough shared memory\n";
-        return FINUFFT_ERR_INSUFFICIENT_SHMEM;
-    }
+  calc_inverse_of_global_sort_idx_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(
+      M, bin_size_x, numbins, d_binstartpts, d_sortidx, d_kx, d_idxnupts, nf1);
+  RETURN_IF_CUDA_ERROR
+
+  calc_subprob_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(d_binsize, d_numsubprob,
+                                                              maxsubprobsize, numbins);
+  RETURN_IF_CUDA_ERROR
+
+  d_ptr    = thrust::device_pointer_cast(d_numsubprob);
+  d_result = thrust::device_pointer_cast(d_subprobstartpts + 1);
+  thrust::inclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result);
 
-    if (d_plan->opts.gpu_kerevalmeth) {
-        for (int t = 0; t < blksize; t++) {
-            spread_1d_subprob<T, 1><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
-                d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, d_binstartpts, d_binsize,
-                bin_size_x, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, numbins, d_idxnupts);
-            RETURN_IF_CUDA_ERROR
-        }
-    } else {
-        for (int t = 0; t < blksize; t++) {
-            spread_1d_subprob<T, 0><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
-                d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, d_binstartpts, d_binsize,
-                bin_size_x, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, numbins, d_idxnupts);
-            RETURN_IF_CUDA_ERROR
-        }
+  if ((ier = checkCudaErrors(cudaMemsetAsync(d_subprobstartpts, 0, sizeof(int), stream))))
+    return ier;
+
+  int totalnumsubprob;
+  if ((ier =
+           checkCudaErrors(cudaMemcpyAsync(&totalnumsubprob, &d_subprobstartpts[n],
+                                           sizeof(int), cudaMemcpyDeviceToHost, stream))))
+    return ier;
+  cudaStreamSynchronize(stream);
+  if ((ier = checkCudaErrors(
+           cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream))))
+    return ier;
+  map_b_into_subprob_1d<<<(numbins + 1024 - 1) / 1024, 1024, 0, stream>>>(
+      d_subprob_to_bin, d_subprobstartpts, d_numsubprob, numbins);
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err));
+    cudaFree(d_subprob_to_bin);
+    return FINUFFT_ERR_CUDA_FAILURE;
+  }
+
+  assert(d_subprob_to_bin != NULL);
+  cudaFreeAsync(d_plan->subprob_to_bin, stream);
+  d_plan->subprob_to_bin  = d_subprob_to_bin;
+  d_plan->totalnumsubprob = totalnumsubprob;
+
+  return 0;
+}
+
+template<typename T>
+int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t<T> *d_plan, int blksize) {
+  auto &stream = d_plan->stream;
+
+  int ns    = d_plan->spopts.nspread; // psi's support in terms of number of cells
+  T es_c    = d_plan->spopts.ES_c;
+  T es_beta = d_plan->spopts.ES_beta;
+  int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize;
+
+  // assume that bin_size_x > ns/2;
+  int bin_size_x = d_plan->opts.gpu_binsizex;
+  int numbins    = ceil((T)nf1 / bin_size_x);
+
+  T *d_kx               = d_plan->kx;
+  cuda_complex<T> *d_c  = d_plan->c;
+  cuda_complex<T> *d_fw = d_plan->fw;
+
+  int *d_binsize         = d_plan->binsize;
+  int *d_binstartpts     = d_plan->binstartpts;
+  int *d_numsubprob      = d_plan->numsubprob;
+  int *d_subprobstartpts = d_plan->subprobstartpts;
+  int *d_idxnupts        = d_plan->idxnupts;
+
+  int totalnumsubprob   = d_plan->totalnumsubprob;
+  int *d_subprob_to_bin = d_plan->subprob_to_bin;
+
+  T sigma = d_plan->opts.upsampfac;
+
+  size_t sharedplanorysize =
+      (bin_size_x + 2 * (int)ceil(ns / 2.0)) * sizeof(cuda_complex<T>);
+  if (sharedplanorysize > 49152) {
+    std::cerr << "[cuspread1d_subprob] error: not enough shared memory\n";
+    return FINUFFT_ERR_INSUFFICIENT_SHMEM;
+  }
+
+  if (d_plan->opts.gpu_kerevalmeth) {
+    for (int t = 0; t < blksize; t++) {
+      spread_1d_subprob<T, 1><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
+          d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma,
+          d_binstartpts, d_binsize, bin_size_x, d_subprob_to_bin, d_subprobstartpts,
+          d_numsubprob, maxsubprobsize, numbins, d_idxnupts);
+      RETURN_IF_CUDA_ERROR
+    }
+  } else {
+    for (int t = 0; t < blksize; t++) {
+      spread_1d_subprob<T, 0><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
+          d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma,
+          d_binstartpts, d_binsize, bin_size_x, d_subprob_to_bin, d_subprobstartpts,
+          d_numsubprob, maxsubprobsize, numbins, d_idxnupts);
+      RETURN_IF_CUDA_ERROR
     }
+  }
 
-    return 0;
+  return 0;
 }
 
 template int cuspread1d<float>(cufinufft_plan_t<float> *d_plan, int blksize);
 template int cuspread1d<double>(cufinufft_plan_t<double> *d_plan, int blksize);
-template int cuspread1d_nuptsdriven_prop<float>(int nf1, int M, cufinufft_plan_t<float> *d_plan);
-template int cuspread1d_nuptsdriven_prop<double>(int nf1, int M, cufinufft_plan_t<double> *d_plan);
-template int cuspread1d_subprob_prop<float>(int nf1, int M, cufinufft_plan_t<float> *d_plan);
-template int cuspread1d_subprob_prop<double>(int nf1, int M, cufinufft_plan_t<double> *d_plan);
+template int cuspread1d_nuptsdriven_prop<float>(int nf1, int M,
+                                                cufinufft_plan_t<float> *d_plan);
+template int cuspread1d_nuptsdriven_prop<double>(int nf1, int M,
+                                                 cufinufft_plan_t<double> *d_plan);
+template int cuspread1d_subprob_prop<float>(int nf1, int M,
+                                            cufinufft_plan_t<float> *d_plan);
+template int cuspread1d_subprob_prop<double>(int nf1, int M,
+                                             cufinufft_plan_t<double> *d_plan);
 
 } // namespace spreadinterp
 } // namespace cufinufft
diff --git a/src/cuda/2d/cufinufft2d.cu b/src/cuda/2d/cufinufft2d.cu
index b566f49ce..5f1fbd55c 100644
--- a/src/cuda/2d/cufinufft2d.cu
+++ b/src/cuda/2d/cufinufft2d.cu
@@ -15,8 +15,9 @@ using namespace cufinufft::deconvolve;
 using namespace cufinufft::spreadinterp;
 using std::min;
 
-template <typename T>
-int cufinufft2d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_plan_t<T> *d_plan)
+template<typename T>
+int cufinufft2d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
+                      cufinufft_plan_t<T> *d_plan)
 /*
     2D Type-1 NUFFT
 
@@ -30,44 +31,45 @@ int cufinufft2d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_pla
     Melody Shih 07/25/19
 */
 {
-    assert(d_plan->spopts.spread_direction == 1);
-
-    int ier;
-    cuda_complex<T> *d_fkstart;
-    cuda_complex<T> *d_cstart;
-
-    auto &stream = d_plan->stream;
-    for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) {
-        int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize);
-        d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M;
-        d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt;
-        d_plan->c = d_cstart;
-        d_plan->fk = d_fkstart;
-
-        // this is needed
-        if ((ier = checkCudaErrors(cudaMemsetAsync(
-                 d_plan->fw, 0, d_plan->maxbatchsize * d_plan->nf1 * d_plan->nf2 * sizeof(cuda_complex<T>), stream))))
-            return ier;
-
-        // Step 1: Spread
-        if ((ier = cuspread2d<T>(d_plan, blksize)))
-            return ier;
-
-        // Step 2: FFT
-        cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
-        if (cufft_status != CUFFT_SUCCESS)
-            return FINUFFT_ERR_CUDA_FAILURE;
-
-        // Step 3: deconvolve and shuffle
-        if ((ier = cudeconvolve2d<T>(d_plan, blksize)))
-            return ier;
-    }
-
-    return 0;
+  assert(d_plan->spopts.spread_direction == 1);
+
+  int ier;
+  cuda_complex<T> *d_fkstart;
+  cuda_complex<T> *d_cstart;
+
+  auto &stream = d_plan->stream;
+  for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) {
+    int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize);
+    d_cstart    = d_c + i * d_plan->maxbatchsize * d_plan->M;
+    d_fkstart   = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt;
+    d_plan->c   = d_cstart;
+    d_plan->fk  = d_fkstart;
+
+    // this is needed
+    if ((ier = checkCudaErrors(cudaMemsetAsync(
+             d_plan->fw, 0,
+             d_plan->maxbatchsize * d_plan->nf1 * d_plan->nf2 * sizeof(cuda_complex<T>),
+             stream))))
+      return ier;
+
+    // Step 1: Spread
+    if ((ier = cuspread2d<T>(d_plan, blksize))) return ier;
+
+    // Step 2: FFT
+    cufftResult cufft_status =
+        cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
+    if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
+
+    // Step 3: deconvolve and shuffle
+    if ((ier = cudeconvolve2d<T>(d_plan, blksize))) return ier;
+  }
+
+  return 0;
 }
 
-template <typename T>
-int cufinufft2d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_plan_t<T> *d_plan)
+template<typename T>
+int cufinufft2d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
+                      cufinufft_plan_t<T> *d_plan)
 /*
     2D Type-2 NUFFT
 
@@ -81,41 +83,41 @@ int cufinufft2d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_pla
     Melody Shih 07/25/19
 */
 {
-    assert(d_plan->spopts.spread_direction == 2);
-
-    int ier;
-    cuda_complex<T> *d_fkstart;
-    cuda_complex<T> *d_cstart;
-    for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) {
-        int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize);
-        d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M;
-        d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt;
-
-        d_plan->c = d_cstart;
-        d_plan->fk = d_fkstart;
-
-        // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
-        if ((ier = cudeconvolve2d<T>(d_plan, blksize)))
-            return ier;
-
-        // Step 2: FFT
-        cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
-        if (cufft_status != CUFFT_SUCCESS)
-            return FINUFFT_ERR_CUDA_FAILURE;
-
-        // Step 3: deconvolve and shuffle
-        if ((ier = cuinterp2d<T>(d_plan, blksize)))
-            return ier;
-    }
-
-    return 0;
+  assert(d_plan->spopts.spread_direction == 2);
+
+  int ier;
+  cuda_complex<T> *d_fkstart;
+  cuda_complex<T> *d_cstart;
+  for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) {
+    int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize);
+    d_cstart    = d_c + i * d_plan->maxbatchsize * d_plan->M;
+    d_fkstart   = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt;
+
+    d_plan->c  = d_cstart;
+    d_plan->fk = d_fkstart;
+
+    // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
+    if ((ier = cudeconvolve2d<T>(d_plan, blksize))) return ier;
+
+    // Step 2: FFT
+    cufftResult cufft_status =
+        cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
+    if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
+
+    // Step 3: interpolate
+    if ((ier = cuinterp2d<T>(d_plan, blksize))) return ier;
+  }
+
+  return 0;
 }
 
 template int cufinufft2d1_exec<float>(cuda_complex<float> *d_c, cuda_complex<float> *d_fk,
                                       cufinufft_plan_t<float> *d_plan);
-template int cufinufft2d1_exec<double>(cuda_complex<double> *d_c, cuda_complex<double> *d_fk,
+template int cufinufft2d1_exec<double>(cuda_complex<double> *d_c,
+                                       cuda_complex<double> *d_fk,
                                        cufinufft_plan_t<double> *d_plan);
 template int cufinufft2d2_exec<float>(cuda_complex<float> *d_c, cuda_complex<float> *d_fk,
                                       cufinufft_plan_t<float> *d_plan);
-template int cufinufft2d2_exec<double>(cuda_complex<double> *d_c, cuda_complex<double> *d_fk,
+template int cufinufft2d2_exec<double>(cuda_complex<double> *d_c,
+                                       cuda_complex<double> *d_fk,
                                        cufinufft_plan_t<double> *d_plan);
diff --git a/src/cuda/2d/interp2d_wrapper.cu b/src/cuda/2d/interp2d_wrapper.cu
index c62188e90..533788482 100644
--- a/src/cuda/2d/interp2d_wrapper.cu
+++ b/src/cuda/2d/interp2d_wrapper.cu
@@ -14,7 +14,7 @@ using namespace cufinufft::memtransfer;
 namespace cufinufft {
 namespace spreadinterp {
 
-template <typename T>
+template<typename T>
 int cuinterp2d(cufinufft_plan_t<T> *d_plan, int blksize)
 /*
     A wrapper for different interpolation methods.
@@ -26,127 +26,130 @@ int cuinterp2d(cufinufft_plan_t<T> *d_plan, int blksize)
     Melody Shih 07/25/19
 */
 {
-    int nf1 = d_plan->nf1;
-    int nf2 = d_plan->nf2;
-    int M = d_plan->M;
-
-    int ier;
-    switch (d_plan->opts.gpu_method) {
-    case 1: {
-        ier = cuinterp2d_nuptsdriven<T>(nf1, nf2, M, d_plan, blksize);
-    } break;
-    case 2: {
-        ier = cuinterp2d_subprob<T>(nf1, nf2, M, d_plan, blksize);
-    } break;
-    default:
-        std::cerr << "[cuinterp2d] error: incorrect method, should be 1 or 2\n";
-        ier = FINUFFT_ERR_METHOD_NOTVALID;
-    }
-
-    return ier;
+  int nf1 = d_plan->nf1;
+  int nf2 = d_plan->nf2;
+  int M   = d_plan->M;
+
+  int ier;
+  switch (d_plan->opts.gpu_method) {
+  case 1: {
+    ier = cuinterp2d_nuptsdriven<T>(nf1, nf2, M, d_plan, blksize);
+  } break;
+  case 2: {
+    ier = cuinterp2d_subprob<T>(nf1, nf2, M, d_plan, blksize);
+  } break;
+  default:
+    std::cerr << "[cuinterp2d] error: incorrect method, should be 1 or 2\n";
+    ier = FINUFFT_ERR_METHOD_NOTVALID;
+  }
+
+  return ier;
 }
 
-template <typename T>
-int cuinterp2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan, int blksize) {
-    auto &stream = d_plan->stream;
-
-    dim3 threadsPerBlock;
-    dim3 blocks;
-
-    int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells
-    T es_c = d_plan->spopts.ES_c;
-    T es_beta = d_plan->spopts.ES_beta;
-    T sigma = d_plan->opts.upsampfac;
-    
-    int *d_idxnupts = d_plan->idxnupts;
-
-    T *d_kx = d_plan->kx;
-    T *d_ky = d_plan->ky;
-    cuda_complex<T> *d_c = d_plan->c;
-    cuda_complex<T> *d_fw = d_plan->fw;
-
-    threadsPerBlock.x = 32;
-    threadsPerBlock.y = 1;
-    blocks.x = (M + threadsPerBlock.x - 1) / threadsPerBlock.x;
-    blocks.y = 1;
-
-    if (d_plan->opts.gpu_kerevalmeth) {
-        for (int t = 0; t < blksize; t++) {
-            interp_2d_nupts_driven<T, 1>
-                <<<blocks, threadsPerBlock, 0, stream>>>(d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2,
-                                                         es_c, es_beta, sigma, d_idxnupts);
-            RETURN_IF_CUDA_ERROR
-        }
-    } else {
-        for (int t = 0; t < blksize; t++) {
-            interp_2d_nupts_driven<T, 0>
-                <<<blocks, threadsPerBlock, 0, stream>>>(d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2,
-                                                         es_c, es_beta, sigma, d_idxnupts);
-            RETURN_IF_CUDA_ERROR
-        }
+template<typename T>
+int cuinterp2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan,
+                           int blksize) {
+  auto &stream = d_plan->stream;
+
+  dim3 threadsPerBlock;
+  dim3 blocks;
+
+  int ns    = d_plan->spopts.nspread; // psi's support in terms of number of cells
+  T es_c    = d_plan->spopts.ES_c;
+  T es_beta = d_plan->spopts.ES_beta;
+  T sigma   = d_plan->opts.upsampfac;
+
+  int *d_idxnupts = d_plan->idxnupts;
+
+  T *d_kx               = d_plan->kx;
+  T *d_ky               = d_plan->ky;
+  cuda_complex<T> *d_c  = d_plan->c;
+  cuda_complex<T> *d_fw = d_plan->fw;
+
+  threadsPerBlock.x = 32;
+  threadsPerBlock.y = 1;
+  blocks.x          = (M + threadsPerBlock.x - 1) / threadsPerBlock.x;
+  blocks.y          = 1;
+
+  if (d_plan->opts.gpu_kerevalmeth) {
+    for (int t = 0; t < blksize; t++) {
+      interp_2d_nupts_driven<T, 1><<<blocks, threadsPerBlock, 0, stream>>>(
+          d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta,
+          sigma, d_idxnupts);
+      RETURN_IF_CUDA_ERROR
     }
+  } else {
+    for (int t = 0; t < blksize; t++) {
+      interp_2d_nupts_driven<T, 0><<<blocks, threadsPerBlock, 0, stream>>>(
+          d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta,
+          sigma, d_idxnupts);
+      RETURN_IF_CUDA_ERROR
+    }
+  }
 
-    return 0;
+  return 0;
 }
 
-template <typename T>
-int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan, int blksize) {
-    auto &stream = d_plan->stream;
-
-    int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells
-    T es_c = d_plan->spopts.ES_c;
-    T es_beta = d_plan->spopts.ES_beta;
-    int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize;
-
-    // assume that bin_size_x > ns/2;
-    int bin_size_x = d_plan->opts.gpu_binsizex;
-    int bin_size_y = d_plan->opts.gpu_binsizey;
-    int numbins[2];
-    numbins[0] = ceil((T)nf1 / bin_size_x);
-    numbins[1] = ceil((T)nf2 / bin_size_y);
-
-    T *d_kx = d_plan->kx;
-    T *d_ky = d_plan->ky;
-    cuda_complex<T> *d_c = d_plan->c;
-    cuda_complex<T> *d_fw = d_plan->fw;
-
-    int *d_binsize = d_plan->binsize;
-    int *d_binstartpts = d_plan->binstartpts;
-    int *d_numsubprob = d_plan->numsubprob;
-    int *d_subprobstartpts = d_plan->subprobstartpts;
-    int *d_idxnupts = d_plan->idxnupts;
-    int *d_subprob_to_bin = d_plan->subprob_to_bin;
-    int totalnumsubprob = d_plan->totalnumsubprob;
-    
-
-    T sigma = d_plan->opts.upsampfac;
-    size_t sharedplanorysize =
-        (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex<T>);
-
-    if (sharedplanorysize > 49152) {
-        std::cerr << "[cuinterp2d_subprob] error: not enough shared memory\n";
-        return FINUFFT_ERR_INSUFFICIENT_SHMEM;
+template<typename T>
+int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan,
+                       int blksize) {
+  auto &stream = d_plan->stream;
+
+  int ns    = d_plan->spopts.nspread; // psi's support in terms of number of cells
+  T es_c    = d_plan->spopts.ES_c;
+  T es_beta = d_plan->spopts.ES_beta;
+  int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize;
+
+  // assume that bin_size_x > ns/2;
+  int bin_size_x = d_plan->opts.gpu_binsizex;
+  int bin_size_y = d_plan->opts.gpu_binsizey;
+  int numbins[2];
+  numbins[0] = ceil((T)nf1 / bin_size_x);
+  numbins[1] = ceil((T)nf2 / bin_size_y);
+
+  T *d_kx               = d_plan->kx;
+  T *d_ky               = d_plan->ky;
+  cuda_complex<T> *d_c  = d_plan->c;
+  cuda_complex<T> *d_fw = d_plan->fw;
+
+  int *d_binsize         = d_plan->binsize;
+  int *d_binstartpts     = d_plan->binstartpts;
+  int *d_numsubprob      = d_plan->numsubprob;
+  int *d_subprobstartpts = d_plan->subprobstartpts;
+  int *d_idxnupts        = d_plan->idxnupts;
+  int *d_subprob_to_bin  = d_plan->subprob_to_bin;
+  int totalnumsubprob    = d_plan->totalnumsubprob;
+
+  T sigma                  = d_plan->opts.upsampfac;
+  size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) *
+                             (bin_size_y + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex<T>);
+
+  if (sharedplanorysize > 49152) {
+    std::cerr << "[cuinterp2d_subprob] error: not enough shared memory\n";
+    return FINUFFT_ERR_INSUFFICIENT_SHMEM;
+  }
+
+  if (d_plan->opts.gpu_kerevalmeth) {
+    for (int t = 0; t < blksize; t++) {
+      interp_2d_subprob<T, 1><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
+          d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta,
+          sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin,
+          d_subprobstartpts, d_numsubprob, maxsubprobsize, numbins[0], numbins[1],
+          d_idxnupts);
+      RETURN_IF_CUDA_ERROR
     }
-
-    if (d_plan->opts.gpu_kerevalmeth) {
-        for (int t = 0; t < blksize; t++) {
-            interp_2d_subprob<T, 1><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
-                d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts,
-                d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize,
-                numbins[0], numbins[1], d_idxnupts);
-            RETURN_IF_CUDA_ERROR
-        }
-    } else {
-        for (int t = 0; t < blksize; t++) {
-            interp_2d_subprob<T, 0><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
-                d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts,
-                d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize,
-                numbins[0], numbins[1], d_idxnupts);
-            RETURN_IF_CUDA_ERROR
-        }
+  } else {
+    for (int t = 0; t < blksize; t++) {
+      interp_2d_subprob<T, 0><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
+          d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta,
+          sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin,
+          d_subprobstartpts, d_numsubprob, maxsubprobsize, numbins[0], numbins[1],
+          d_idxnupts);
+      RETURN_IF_CUDA_ERROR
     }
+  }
 
-    return 0;
+  return 0;
 }
 
 template int cuinterp2d<float>(cufinufft_plan_t<float> *d_plan, int blksize);
diff --git a/src/cuda/2d/spread2d_wrapper.cu b/src/cuda/2d/spread2d_wrapper.cu
index 3b27f7efd..f03e658d2 100644
--- a/src/cuda/2d/spread2d_wrapper.cu
+++ b/src/cuda/2d/spread2d_wrapper.cu
@@ -19,7 +19,7 @@ using namespace cufinufft::memtransfer;
 namespace cufinufft {
 namespace spreadinterp {
 
-template <typename T>
+template<typename T>
 int cuspread2d(cufinufft_plan_t<T> *d_plan, int blksize)
 /*
     A wrapper for different spreading methods.
@@ -31,135 +31,40 @@ int cuspread2d(cufinufft_plan_t<T> *d_plan, int blksize)
     Melody Shih 07/25/19
 */
 {
-    int nf1 = d_plan->nf1;
-    int nf2 = d_plan->nf2;
-    int M = d_plan->M;
-
-    int ier;
-    switch (d_plan->opts.gpu_method) {
-    case 1: {
-        ier = cuspread2d_nuptsdriven<T>(nf1, nf2, M, d_plan, blksize);
-    } break;
-    case 2: {
-        ier = cuspread2d_subprob<T>(nf1, nf2, M, d_plan, blksize);
-    } break;
-    default:
-        std::cerr << "[cuspread2d] error: incorrect method, should be 1 or 2\n";
-        ier = FINUFFT_ERR_METHOD_NOTVALID;
-    }
-
-    return ier;
+  int nf1 = d_plan->nf1;
+  int nf2 = d_plan->nf2;
+  int M   = d_plan->M;
+
+  int ier;
+  switch (d_plan->opts.gpu_method) {
+  case 1: {
+    ier = cuspread2d_nuptsdriven<T>(nf1, nf2, M, d_plan, blksize);
+  } break;
+  case 2: {
+    ier = cuspread2d_subprob<T>(nf1, nf2, M, d_plan, blksize);
+  } break;
+  default:
+    std::cerr << "[cuspread2d] error: incorrect method, should be 1 or 2\n";
+    ier = FINUFFT_ERR_METHOD_NOTVALID;
+  }
+
+  return ier;
 }
 
-template <typename T>
+template<typename T>
 int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan) {
-    auto &stream = d_plan->stream;
-
-    if (d_plan->opts.gpu_sort) {
-        int bin_size_x = d_plan->opts.gpu_binsizex;
-        int bin_size_y = d_plan->opts.gpu_binsizey;
-        if (bin_size_x < 0 || bin_size_y < 0) {
-            std::cerr << "[cuspread2d_nuptsdriven_prop] error: invalid binsize (binsizex, binsizey) = (";
-            std::cerr << bin_size_x << "," << bin_size_y << ")" << std::endl;
-            return FINUFFT_ERR_BINSIZE_NOTVALID;
-        }
-
-        int numbins[2];
-        numbins[0] = ceil((T)nf1 / bin_size_x);
-        numbins[1] = ceil((T)nf2 / bin_size_y);
-
-        T *d_kx = d_plan->kx;
-        T *d_ky = d_plan->ky;
-
-        int *d_binsize = d_plan->binsize;
-        int *d_binstartpts = d_plan->binstartpts;
-        int *d_sortidx = d_plan->sortidx;
-        int *d_idxnupts = d_plan->idxnupts;
-
-        int ier;
-        if ((ier = checkCudaErrors(cudaMemsetAsync(d_binsize, 0, numbins[0] * numbins[1] * sizeof(int), stream))))
-            return ier;
-
-        calc_bin_size_noghost_2d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(
-            M, nf1, nf2, bin_size_x, bin_size_y, numbins[0], numbins[1], d_binsize, d_kx, d_ky, d_sortidx);
-        RETURN_IF_CUDA_ERROR
-
-        int n = numbins[0] * numbins[1];
-        thrust::device_ptr<int> d_ptr(d_binsize);
-        thrust::device_ptr<int> d_result(d_binstartpts);
-        thrust::exclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result);
-
-        calc_inverse_of_global_sort_index_2d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(
-            M, bin_size_x, bin_size_y, numbins[0], numbins[1], d_binstartpts, d_sortidx, d_kx, d_ky, d_idxnupts, nf1, nf2);
-        RETURN_IF_CUDA_ERROR
-    } else {
-        int *d_idxnupts = d_plan->idxnupts;
-
-        trivial_global_sort_index_2d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(M, d_idxnupts);
-        RETURN_IF_CUDA_ERROR
-    }
-
-    return 0;
-}
-
-template <typename T>
-int cuspread2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan, int blksize) {
-    auto &stream = d_plan->stream;
-    dim3 threadsPerBlock;
-    dim3 blocks;
-
-    int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells
-    int *d_idxnupts = d_plan->idxnupts;
-    T es_c = d_plan->spopts.ES_c;
-    T es_beta = d_plan->spopts.ES_beta;
-    T sigma = d_plan->spopts.upsampfac;
-
-    T *d_kx = d_plan->kx;
-    T *d_ky = d_plan->ky;
-    cuda_complex<T> *d_c = d_plan->c;
-    cuda_complex<T> *d_fw = d_plan->fw;
-
-    threadsPerBlock.x = 16;
-    threadsPerBlock.y = 1;
-    blocks.x = (M + threadsPerBlock.x - 1) / threadsPerBlock.x;
-    blocks.y = 1;
-    if (d_plan->opts.gpu_kerevalmeth) {
-        for (int t = 0; t < blksize; t++) {
-            spread_2d_nupts_driven<T, 1>
-                <<<blocks, threadsPerBlock, 0, stream>>>(d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2,
-                                                         es_c, es_beta, sigma, d_idxnupts);
-            RETURN_IF_CUDA_ERROR
-        }
-    } else {
-        for (int t = 0; t < blksize; t++) {
-            spread_2d_nupts_driven<T, 0>
-                <<<blocks, threadsPerBlock, 0, stream>>>(d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2,
-                                                         es_c, es_beta, sigma, d_idxnupts);
-            RETURN_IF_CUDA_ERROR
-        }
-    }
+  auto &stream = d_plan->stream;
 
-    return 0;
-}
-
-template <typename T>
-int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan)
-/*
-    This function determines the properties for spreading that are independent
-    of the strength of the nodes,  only relates to the locations of the nodes,
-    which only needs to be done once.
-*/
-{
-    auto &stream = d_plan->stream;
-
-    int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize;
+  if (d_plan->opts.gpu_sort) {
     int bin_size_x = d_plan->opts.gpu_binsizex;
     int bin_size_y = d_plan->opts.gpu_binsizey;
     if (bin_size_x < 0 || bin_size_y < 0) {
-        std::cerr << "[cuspread2d_subprob_prop] error: invalid binsize (binsizex, binsizey) = (";
-        std::cerr << bin_size_x << "," << bin_size_y << ")" << std::endl;
-        return FINUFFT_ERR_BINSIZE_NOTVALID;
+      std::cerr << "[cuspread2d_nuptsdriven_prop] error: invalid binsize (binsizex, "
+                   "binsizey) = (";
+      std::cerr << bin_size_x << "," << bin_size_y << ")" << std::endl;
+      return FINUFFT_ERR_BINSIZE_NOTVALID;
     }
+
     int numbins[2];
     numbins[0] = ceil((T)nf1 / bin_size_x);
     numbins[1] = ceil((T)nf2 / bin_size_y);
@@ -167,21 +72,19 @@ int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan
     T *d_kx = d_plan->kx;
     T *d_ky = d_plan->ky;
 
-    int *d_binsize = d_plan->binsize;
+    int *d_binsize     = d_plan->binsize;
     int *d_binstartpts = d_plan->binstartpts;
-    int *d_sortidx = d_plan->sortidx;
-    int *d_numsubprob = d_plan->numsubprob;
-    int *d_subprobstartpts = d_plan->subprobstartpts;
-    int *d_idxnupts = d_plan->idxnupts;
-
-    int *d_subprob_to_bin = NULL;
+    int *d_sortidx     = d_plan->sortidx;
+    int *d_idxnupts    = d_plan->idxnupts;
 
     int ier;
-    if ((ier = checkCudaErrors(cudaMemsetAsync(d_binsize, 0, numbins[0] * numbins[1] * sizeof(int), stream))))
-        return ier;
+    if ((ier = checkCudaErrors(cudaMemsetAsync(
+             d_binsize, 0, numbins[0] * numbins[1] * sizeof(int), stream))))
+      return ier;
 
     calc_bin_size_noghost_2d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(
-        M, nf1, nf2, bin_size_x, bin_size_y, numbins[0], numbins[1], d_binsize, d_kx, d_ky, d_sortidx);
+        M, nf1, nf2, bin_size_x, bin_size_y, numbins[0], numbins[1], d_binsize, d_kx,
+        d_ky, d_sortidx);
     RETURN_IF_CUDA_ERROR
 
     int n = numbins[0] * numbins[1];
@@ -190,110 +93,226 @@ int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan
     thrust::exclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result);
 
     calc_inverse_of_global_sort_index_2d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(
-        M, bin_size_x, bin_size_y, numbins[0], numbins[1], d_binstartpts, d_sortidx, d_kx, d_ky, d_idxnupts,
-        nf1, nf2);
-    RETURN_IF_CUDA_ERROR
-    calc_subprob_2d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(d_binsize, d_numsubprob, maxsubprobsize,
-                                                                numbins[0] * numbins[1]);
+        M, bin_size_x, bin_size_y, numbins[0], numbins[1], d_binstartpts, d_sortidx, d_kx,
+        d_ky, d_idxnupts, nf1, nf2);
     RETURN_IF_CUDA_ERROR
+  } else {
+    int *d_idxnupts = d_plan->idxnupts;
 
-    d_ptr = thrust::device_pointer_cast(d_numsubprob);
-    d_result = thrust::device_pointer_cast(d_subprobstartpts + 1);
-    thrust::inclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result);
-
-    if ((ier = checkCudaErrors(cudaMemsetAsync(d_subprobstartpts, 0, sizeof(int), stream))))
-        return ier;
-
-    int totalnumsubprob;
-    if ((ier = checkCudaErrors(
-             cudaMemcpyAsync(&totalnumsubprob, &d_subprobstartpts[n], sizeof(int), cudaMemcpyDeviceToHost, stream))))
-        return ier;
-    cudaStreamSynchronize(stream);
-    if ((ier = checkCudaErrors(cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream))))
-        return ier;
-    map_b_into_subprob_2d<<<(numbins[0] * numbins[1] + 1024 - 1) / 1024, 1024, 0, stream>>>(
-        d_subprob_to_bin, d_subprobstartpts, d_numsubprob, numbins[0] * numbins[1]);
-    cudaError_t err = cudaGetLastError();
-    if (err != cudaSuccess) {
-        fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err));
-        cudaFree(d_subprob_to_bin);
-        return FINUFFT_ERR_CUDA_FAILURE;
-    }
-
-    assert(d_subprob_to_bin != NULL);
-    cudaFreeAsync(d_plan->subprob_to_bin, stream);
-    d_plan->subprob_to_bin = d_subprob_to_bin;
-    d_plan->totalnumsubprob = totalnumsubprob;
+    trivial_global_sort_index_2d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(M,
+                                                                             d_idxnupts);
+    RETURN_IF_CUDA_ERROR
+  }
 
-    return 0;
+  return 0;
 }
 
-template <typename T>
-int cuspread2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan, int blksize) {
-    auto &stream = d_plan->stream;
-
-    int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells
-    T es_c = d_plan->spopts.ES_c;
-    T es_beta = d_plan->spopts.ES_beta;
-    int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize;
-
-    // assume that bin_size_x > ns/2;
-    int bin_size_x = d_plan->opts.gpu_binsizex;
-    int bin_size_y = d_plan->opts.gpu_binsizey;
-    int numbins[2];
-    numbins[0] = ceil((T)nf1 / bin_size_x);
-    numbins[1] = ceil((T)nf2 / bin_size_y);
+template<typename T>
+int cuspread2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan,
+                           int blksize) {
+  auto &stream = d_plan->stream;
+  dim3 threadsPerBlock;
+  dim3 blocks;
+
+  int ns          = d_plan->spopts.nspread; // psi's support in terms of number of cells
+  int *d_idxnupts = d_plan->idxnupts;
+  T es_c          = d_plan->spopts.ES_c;
+  T es_beta       = d_plan->spopts.ES_beta;
+  T sigma         = d_plan->spopts.upsampfac;
+
+  T *d_kx               = d_plan->kx;
+  T *d_ky               = d_plan->ky;
+  cuda_complex<T> *d_c  = d_plan->c;
+  cuda_complex<T> *d_fw = d_plan->fw;
+
+  threadsPerBlock.x = 16;
+  threadsPerBlock.y = 1;
+  blocks.x          = (M + threadsPerBlock.x - 1) / threadsPerBlock.x;
+  blocks.y          = 1;
+  if (d_plan->opts.gpu_kerevalmeth) {
+    for (int t = 0; t < blksize; t++) {
+      spread_2d_nupts_driven<T, 1><<<blocks, threadsPerBlock, 0, stream>>>(
+          d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta,
+          sigma, d_idxnupts);
+      RETURN_IF_CUDA_ERROR
+    }
+  } else {
+    for (int t = 0; t < blksize; t++) {
+      spread_2d_nupts_driven<T, 0><<<blocks, threadsPerBlock, 0, stream>>>(
+          d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta,
+          sigma, d_idxnupts);
+      RETURN_IF_CUDA_ERROR
+    }
+  }
 
-    T *d_kx = d_plan->kx;
-    T *d_ky = d_plan->ky;
-    cuda_complex<T> *d_c = d_plan->c;
-    cuda_complex<T> *d_fw = d_plan->fw;
+  return 0;
+}
 
-    int *d_binsize = d_plan->binsize;
-    int *d_binstartpts = d_plan->binstartpts;
-    int *d_numsubprob = d_plan->numsubprob;
-    int *d_subprobstartpts = d_plan->subprobstartpts;
-    int *d_idxnupts = d_plan->idxnupts;
+template<typename T>
+int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan)
+/*
+    This function determines the properties for spreading that are independent
+    of the strengths of the nodes and depend only on the locations of the nodes,
+    so it only needs to be done once.
+*/
+{
+  auto &stream = d_plan->stream;
+
+  int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize;
+  int bin_size_x     = d_plan->opts.gpu_binsizex;
+  int bin_size_y     = d_plan->opts.gpu_binsizey;
+  if (bin_size_x < 0 || bin_size_y < 0) {
+    std::cerr
+        << "[cuspread2d_subprob_prop] error: invalid binsize (binsizex, binsizey) = (";
+    std::cerr << bin_size_x << "," << bin_size_y << ")" << std::endl;
+    return FINUFFT_ERR_BINSIZE_NOTVALID;
+  }
+  int numbins[2];
+  numbins[0] = ceil((T)nf1 / bin_size_x);
+  numbins[1] = ceil((T)nf2 / bin_size_y);
+
+  T *d_kx = d_plan->kx;
+  T *d_ky = d_plan->ky;
+
+  int *d_binsize         = d_plan->binsize;
+  int *d_binstartpts     = d_plan->binstartpts;
+  int *d_sortidx         = d_plan->sortidx;
+  int *d_numsubprob      = d_plan->numsubprob;
+  int *d_subprobstartpts = d_plan->subprobstartpts;
+  int *d_idxnupts        = d_plan->idxnupts;
+
+  int *d_subprob_to_bin = NULL;
+
+  int ier;
+  if ((ier = checkCudaErrors(
+           cudaMemsetAsync(d_binsize, 0, numbins[0] * numbins[1] * sizeof(int), stream))))
+    return ier;
 
-    int totalnumsubprob = d_plan->totalnumsubprob;
-    int *d_subprob_to_bin = d_plan->subprob_to_bin;
+  calc_bin_size_noghost_2d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(
+      M, nf1, nf2, bin_size_x, bin_size_y, numbins[0], numbins[1], d_binsize, d_kx, d_ky,
+      d_sortidx);
+  RETURN_IF_CUDA_ERROR
+
+  int n = numbins[0] * numbins[1];
+  thrust::device_ptr<int> d_ptr(d_binsize);
+  thrust::device_ptr<int> d_result(d_binstartpts);
+  thrust::exclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result);
+
+  calc_inverse_of_global_sort_index_2d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(
+      M, bin_size_x, bin_size_y, numbins[0], numbins[1], d_binstartpts, d_sortidx, d_kx,
+      d_ky, d_idxnupts, nf1, nf2);
+  RETURN_IF_CUDA_ERROR
+  calc_subprob_2d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(
+      d_binsize, d_numsubprob, maxsubprobsize, numbins[0] * numbins[1]);
+  RETURN_IF_CUDA_ERROR
+
+  d_ptr    = thrust::device_pointer_cast(d_numsubprob);
+  d_result = thrust::device_pointer_cast(d_subprobstartpts + 1);
+  thrust::inclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result);
+
+  if ((ier = checkCudaErrors(cudaMemsetAsync(d_subprobstartpts, 0, sizeof(int), stream))))
+    return ier;
 
-    T sigma = d_plan->opts.upsampfac;
+  int totalnumsubprob;
+  if ((ier =
+           checkCudaErrors(cudaMemcpyAsync(&totalnumsubprob, &d_subprobstartpts[n],
+                                           sizeof(int), cudaMemcpyDeviceToHost, stream))))
+    return ier;
+  cudaStreamSynchronize(stream);
+  if ((ier = checkCudaErrors(
+           cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream))))
+    return ier;
+  map_b_into_subprob_2d<<<(numbins[0] * numbins[1] + 1024 - 1) / 1024, 1024, 0, stream>>>(
+      d_subprob_to_bin, d_subprobstartpts, d_numsubprob, numbins[0] * numbins[1]);
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err));
+    cudaFree(d_subprob_to_bin);
+    return FINUFFT_ERR_CUDA_FAILURE;
+  }
+
+  assert(d_subprob_to_bin != NULL);
+  cudaFreeAsync(d_plan->subprob_to_bin, stream);
+  d_plan->subprob_to_bin  = d_subprob_to_bin;
+  d_plan->totalnumsubprob = totalnumsubprob;
+
+  return 0;
+}
 
-    size_t sharedplanorysize =
-        (bin_size_x + 2 * (int)ceil(ns / 2.0)) * (bin_size_y + 2 * (int)ceil(ns / 2.0)) * sizeof(cuda_complex<T>);
-    if (sharedplanorysize > 49152) {
-        std::cerr << "[cuspread2d_subprob] error: not enough shared memory\n";
-        return FINUFFT_ERR_INSUFFICIENT_SHMEM;
+template<typename T>
+int cuspread2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t<T> *d_plan,
+                       int blksize) {
+  auto &stream = d_plan->stream;
+
+  int ns    = d_plan->spopts.nspread; // psi's support in terms of number of cells
+  T es_c    = d_plan->spopts.ES_c;
+  T es_beta = d_plan->spopts.ES_beta;
+  int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize;
+
+  // assume that bin_size_x > ns/2;
+  int bin_size_x = d_plan->opts.gpu_binsizex;
+  int bin_size_y = d_plan->opts.gpu_binsizey;
+  int numbins[2];
+  numbins[0] = ceil((T)nf1 / bin_size_x);
+  numbins[1] = ceil((T)nf2 / bin_size_y);
+
+  T *d_kx               = d_plan->kx;
+  T *d_ky               = d_plan->ky;
+  cuda_complex<T> *d_c  = d_plan->c;
+  cuda_complex<T> *d_fw = d_plan->fw;
+
+  int *d_binsize         = d_plan->binsize;
+  int *d_binstartpts     = d_plan->binstartpts;
+  int *d_numsubprob      = d_plan->numsubprob;
+  int *d_subprobstartpts = d_plan->subprobstartpts;
+  int *d_idxnupts        = d_plan->idxnupts;
+
+  int totalnumsubprob   = d_plan->totalnumsubprob;
+  int *d_subprob_to_bin = d_plan->subprob_to_bin;
+
+  T sigma = d_plan->opts.upsampfac;
+
+  size_t sharedplanorysize = (bin_size_x + 2 * (int)ceil(ns / 2.0)) *
+                             (bin_size_y + 2 * (int)ceil(ns / 2.0)) *
+                             sizeof(cuda_complex<T>);
+  if (sharedplanorysize > 49152) {
+    std::cerr << "[cuspread2d_subprob] error: not enough shared memory\n";
+    return FINUFFT_ERR_INSUFFICIENT_SHMEM;
+  }
+
+  if (d_plan->opts.gpu_kerevalmeth) {
+    for (int t = 0; t < blksize; t++) {
+      spread_2d_subprob<T, 1><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
+          d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta,
+          sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin,
+          d_subprobstartpts, d_numsubprob, maxsubprobsize, numbins[0], numbins[1],
+          d_idxnupts);
+      RETURN_IF_CUDA_ERROR
     }
-
-    if (d_plan->opts.gpu_kerevalmeth) {
-        for (int t = 0; t < blksize; t++) {
-            spread_2d_subprob<T, 1><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
-                d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts,
-                d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize,
-                numbins[0], numbins[1], d_idxnupts);
-            RETURN_IF_CUDA_ERROR
-        }
-    } else {
-        for (int t = 0; t < blksize; t++) {
-            spread_2d_subprob<T, 0><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
-                d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts,
-                d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize,
-                numbins[0], numbins[1], d_idxnupts);
-            RETURN_IF_CUDA_ERROR
-        }
+  } else {
+    for (int t = 0; t < blksize; t++) {
+      spread_2d_subprob<T, 0><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
+          d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta,
+          sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin,
+          d_subprobstartpts, d_numsubprob, maxsubprobsize, numbins[0], numbins[1],
+          d_idxnupts);
+      RETURN_IF_CUDA_ERROR
     }
+  }
 
-    return 0;
+  return 0;
 }
 
 template int cuspread2d<float>(cufinufft_plan_t<float> *d_plan, int blksize);
 template int cuspread2d<double>(cufinufft_plan_t<double> *d_plan, int blksize);
-template int cuspread2d_subprob_prop<float>(int nf1, int nf2, int M, cufinufft_plan_t<float> *d_plan);
-template int cuspread2d_subprob_prop<double>(int nf1, int nf2, int M, cufinufft_plan_t<double> *d_plan);
-template int cuspread2d_nuptsdriven_prop<float>(int nf1, int nf2, int M, cufinufft_plan_t<float> *d_plan);
-template int cuspread2d_nuptsdriven_prop<double>(int nf1, int nf2, int M, cufinufft_plan_t<double> *d_plan);
+template int cuspread2d_subprob_prop<float>(int nf1, int nf2, int M,
+                                            cufinufft_plan_t<float> *d_plan);
+template int cuspread2d_subprob_prop<double>(int nf1, int nf2, int M,
+                                             cufinufft_plan_t<double> *d_plan);
+template int cuspread2d_nuptsdriven_prop<float>(int nf1, int nf2, int M,
+                                                cufinufft_plan_t<float> *d_plan);
+template int cuspread2d_nuptsdriven_prop<double>(int nf1, int nf2, int M,
+                                                 cufinufft_plan_t<double> *d_plan);
 
 } // namespace spreadinterp
 } // namespace cufinufft
diff --git a/src/cuda/3d/cufinufft3d.cu b/src/cuda/3d/cufinufft3d.cu
index fa02ef860..41d69b03f 100644
--- a/src/cuda/3d/cufinufft3d.cu
+++ b/src/cuda/3d/cufinufft3d.cu
@@ -15,8 +15,9 @@ using namespace cufinufft::deconvolve;
 using namespace cufinufft::spreadinterp;
 using std::min;
 
-template <typename T>
-int cufinufft3d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_plan_t<T> *d_plan)
+template<typename T>
+int cufinufft3d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
+                      cufinufft_plan_t<T> *d_plan)
 /*
     3D Type-1 NUFFT
 
@@ -30,42 +31,43 @@ int cufinufft3d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_pla
     Melody Shih 07/25/19
 */
 {
-    auto &stream = d_plan->stream;
-    int ier;
-    cuda_complex<T> *d_fkstart;
-    cuda_complex<T> *d_cstart;
-    for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) {
-        int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize);
-        d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M;
-        d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt * d_plan->mu;
-
-        d_plan->c = d_cstart;
-        d_plan->fk = d_fkstart;
-
-        if ((ier = checkCudaErrors(cudaMemsetAsync(
-                 d_plan->fw, 0,
-                 d_plan->maxbatchsize * d_plan->nf1 * d_plan->nf2 * d_plan->nf3 * sizeof(cuda_complex<T>), stream))))
-            return ier;
-
-        // Step 1: Spread
-        if ((ier = cuspread3d<T>(d_plan, blksize)))
-            return ier;
-
-        // Step 2: FFT
-        cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
-        if (cufft_status != CUFFT_SUCCESS)
-            return FINUFFT_ERR_CUDA_FAILURE;
-
-        // Step 3: deconvolve and shuffle
-        if ((ier = cudeconvolve3d<T>(d_plan, blksize)))
-            return ier;
-    }
-
-    return 0;
+  auto &stream = d_plan->stream;
+  int ier;
+  cuda_complex<T> *d_fkstart;
+  cuda_complex<T> *d_cstart;
+  for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) {
+    int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize);
+    d_cstart    = d_c + i * d_plan->maxbatchsize * d_plan->M;
+    d_fkstart   = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt * d_plan->mu;
+
+    d_plan->c  = d_cstart;
+    d_plan->fk = d_fkstart;
+
+    if ((ier = checkCudaErrors(
+             cudaMemsetAsync(d_plan->fw, 0,
+                             d_plan->maxbatchsize * d_plan->nf1 * d_plan->nf2 *
+                                 d_plan->nf3 * sizeof(cuda_complex<T>),
+                             stream))))
+      return ier;
+
+    // Step 1: Spread
+    if ((ier = cuspread3d<T>(d_plan, blksize))) return ier;
+
+    // Step 2: FFT
+    cufftResult cufft_status =
+        cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
+    if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
+
+    // Step 3: deconvolve and shuffle
+    if ((ier = cudeconvolve3d<T>(d_plan, blksize))) return ier;
+  }
+
+  return 0;
 }
 
-template <typename T>
-int cufinufft3d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_plan_t<T> *d_plan)
+template<typename T>
+int cufinufft3d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk,
+                      cufinufft_plan_t<T> *d_plan)
 /*
     3D Type-2 NUFFT
 
@@ -79,41 +81,41 @@ int cufinufft3d2_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_pla
     Melody Shih 07/25/19
 */
 {
-    int ier;
-    cuda_complex<T> *d_fkstart;
-    cuda_complex<T> *d_cstart;
-    for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) {
-        int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize);
-        d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M;
-        d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt * d_plan->mu;
-
-        d_plan->c = d_cstart;
-        d_plan->fk = d_fkstart;
-
-        // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
-        if ((ier = cudeconvolve3d<T>(d_plan, blksize)))
-            return ier;
-
-        // Step 2: FFT
-        RETURN_IF_CUDA_ERROR
-        cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
-        if (cufft_status != CUFFT_SUCCESS)
-            return FINUFFT_ERR_CUDA_FAILURE;
-
-        // Step 3: deconvolve and shuffle
-        if ((ier = cuinterp3d<T>(d_plan, blksize)))
-            return ier;
-    }
-
-    return 0;
+  int ier;
+  cuda_complex<T> *d_fkstart;
+  cuda_complex<T> *d_cstart;
+  for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) {
+    int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize);
+    d_cstart    = d_c + i * d_plan->maxbatchsize * d_plan->M;
+    d_fkstart   = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt * d_plan->mu;
+
+    d_plan->c  = d_cstart;
+    d_plan->fk = d_fkstart;
+
+    // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
+    if ((ier = cudeconvolve3d<T>(d_plan, blksize))) return ier;
+
+    // Step 2: FFT
+    RETURN_IF_CUDA_ERROR
+    cufftResult cufft_status =
+        cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
+    if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE;
+
+    // Step 3: interpolate from the upsampled array fw to the nonuniform points
+    if ((ier = cuinterp3d<T>(d_plan, blksize))) return ier;
+  }
+
+  return 0;
 }
 
 template int cufinufft3d1_exec<float>(cuda_complex<float> *d_c, cuda_complex<float> *d_fk,
                                       cufinufft_plan_t<float> *d_plan);
-template int cufinufft3d1_exec<double>(cuda_complex<double> *d_c, cuda_complex<double> *d_fk,
+template int cufinufft3d1_exec<double>(cuda_complex<double> *d_c,
+                                       cuda_complex<double> *d_fk,
                                        cufinufft_plan_t<double> *d_plan);
 
 template int cufinufft3d2_exec<float>(cuda_complex<float> *d_c, cuda_complex<float> *d_fk,
                                       cufinufft_plan_t<float> *d_plan);
-template int cufinufft3d2_exec<double>(cuda_complex<double> *d_c, cuda_complex<double> *d_fk,
+template int cufinufft3d2_exec<double>(cuda_complex<double> *d_c,
+                                       cuda_complex<double> *d_fk,
                                        cufinufft_plan_t<double> *d_plan);
diff --git a/src/cuda/3d/interp3d_wrapper.cu b/src/cuda/3d/interp3d_wrapper.cu
index 9cdceccd0..b42231d86 100644
--- a/src/cuda/3d/interp3d_wrapper.cu
+++ b/src/cuda/3d/interp3d_wrapper.cu
@@ -14,7 +14,7 @@ using namespace cufinufft::memtransfer;
 namespace cufinufft {
 namespace spreadinterp {
 
-template <typename T>
+template<typename T>
 int cuinterp3d(cufinufft_plan_t<T> *d_plan, int blksize)
 /*
     A wrapper for different interpolation methods.
@@ -26,141 +26,147 @@ int cuinterp3d(cufinufft_plan_t<T> *d_plan, int blksize)
     Melody Shih 07/25/19
 */
 {
-    int nf1 = d_plan->nf1;
-    int nf2 = d_plan->nf2;
-    int nf3 = d_plan->nf3;
-    int M = d_plan->M;
-
-    int ier;
-    switch (d_plan->opts.gpu_method) {
-    case 1: {
-        ier = cuinterp3d_nuptsdriven<T>(nf1, nf2, nf3, M, d_plan, blksize);
-    } break;
-    case 2: {
-        ier = cuinterp3d_subprob<T>(nf1, nf2, nf3, M, d_plan, blksize);
-    } break;
-    default:
-        std::cerr << "[cuinterp3d] error: incorrect method, should be 1,2\n";
-        ier = FINUFFT_ERR_METHOD_NOTVALID;
-    }
-
-    return ier;
+  int nf1 = d_plan->nf1;
+  int nf2 = d_plan->nf2;
+  int nf3 = d_plan->nf3;
+  int M   = d_plan->M;
+
+  int ier;
+  switch (d_plan->opts.gpu_method) {
+  case 1: {
+    ier = cuinterp3d_nuptsdriven<T>(nf1, nf2, nf3, M, d_plan, blksize);
+  } break;
+  case 2: {
+    ier = cuinterp3d_subprob<T>(nf1, nf2, nf3, M, d_plan, blksize);
+  } break;
+  default:
+    std::cerr << "[cuinterp3d] error: incorrect method, should be 1,2\n";
+    ier = FINUFFT_ERR_METHOD_NOTVALID;
+  }
+
+  return ier;
 }
 
-template <typename T>
-int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan, int blksize) {
-    auto &stream = d_plan->stream;
-
-    dim3 threadsPerBlock;
-    dim3 blocks;
-
-    int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells
-    T es_c = d_plan->spopts.ES_c;
-    T es_beta = d_plan->spopts.ES_beta;
-    T sigma = d_plan->spopts.upsampfac;
-
-    int *d_idxnupts = d_plan->idxnupts;
-
-    T *d_kx = d_plan->kx;
-    T *d_ky = d_plan->ky;
-    T *d_kz = d_plan->kz;
-    cuda_complex<T> *d_c = d_plan->c;
-    cuda_complex<T> *d_fw = d_plan->fw;
-
-    threadsPerBlock.x = 16;
-    threadsPerBlock.y = 1;
-    blocks.x = (M + threadsPerBlock.x - 1) / threadsPerBlock.x;
-    blocks.y = 1;
-
-    if (d_plan->opts.gpu_kerevalmeth) {
-        for (int t = 0; t < blksize; t++) {
-            interp_3d_nupts_driven<T, 1>
-                <<<blocks, threadsPerBlock, 0, stream>>>(d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M,
-                                                         ns, nf1, nf2, nf3, es_c, es_beta, sigma, d_idxnupts);
-            RETURN_IF_CUDA_ERROR
-        }
-    } else {
-        for (int t = 0; t < blksize; t++) {
-            interp_3d_nupts_driven<T, 0>
-                <<<blocks, threadsPerBlock, 0, stream>>>(d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M,
-                                                         ns, nf1, nf2, nf3, es_c, es_beta, sigma, d_idxnupts);
-            RETURN_IF_CUDA_ERROR
-        }
-    }
+template<typename T>
+int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan,
+                           int blksize) {
+  auto &stream = d_plan->stream;
 
-    return 0;
-}
+  dim3 threadsPerBlock;
+  dim3 blocks;
 
-template <typename T>
-int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan, int blksize) {
-    auto &stream = d_plan->stream;
-
-    int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells
-    int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize;
-
-    // assume that bin_size_x > ns/2;
-    int bin_size_x = d_plan->opts.gpu_binsizex;
-    int bin_size_y = d_plan->opts.gpu_binsizey;
-    int bin_size_z = d_plan->opts.gpu_binsizez;
-    int numbins[3];
-    numbins[0] = ceil((T)nf1 / bin_size_x);
-    numbins[1] = ceil((T)nf2 / bin_size_y);
-    numbins[2] = ceil((T)nf3 / bin_size_z);
-
-    T *d_kx = d_plan->kx;
-    T *d_ky = d_plan->ky;
-    T *d_kz = d_plan->kz;
-    cuda_complex<T> *d_c = d_plan->c;
-    cuda_complex<T> *d_fw = d_plan->fw;
-
-    int *d_binsize = d_plan->binsize;
-    int *d_binstartpts = d_plan->binstartpts;
-    int *d_numsubprob = d_plan->numsubprob;
-    int *d_subprobstartpts = d_plan->subprobstartpts;
-    int *d_idxnupts = d_plan->idxnupts;
-    int *d_subprob_to_bin = d_plan->subprob_to_bin;
-    int totalnumsubprob = d_plan->totalnumsubprob;
-
-    T sigma = d_plan->spopts.upsampfac;
-    T es_c = d_plan->spopts.ES_c;
-    T es_beta = d_plan->spopts.ES_beta;
-    size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)) *
-                               (bin_size_z + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex<T>);
-    if (sharedplanorysize > 49152) {
-        std::cerr << "[cuinterp3d_subprob] error: not enough shared memory\n";
-        return FINUFFT_ERR_INSUFFICIENT_SHMEM;
-    }
+  int ns    = d_plan->spopts.nspread; // psi's support in terms of number of cells
+  T es_c    = d_plan->spopts.ES_c;
+  T es_beta = d_plan->spopts.ES_beta;
+  T sigma   = d_plan->spopts.upsampfac;
+
+  int *d_idxnupts = d_plan->idxnupts;
+
+  T *d_kx               = d_plan->kx;
+  T *d_ky               = d_plan->ky;
+  T *d_kz               = d_plan->kz;
+  cuda_complex<T> *d_c  = d_plan->c;
+  cuda_complex<T> *d_fw = d_plan->fw;
 
+  threadsPerBlock.x = 16;
+  threadsPerBlock.y = 1;
+  blocks.x          = (M + threadsPerBlock.x - 1) / threadsPerBlock.x;
+  blocks.y          = 1;
+
+  if (d_plan->opts.gpu_kerevalmeth) {
+    for (int t = 0; t < blksize; t++) {
+      interp_3d_nupts_driven<T, 1><<<blocks, threadsPerBlock, 0, stream>>>(
+          d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3,
+          es_c, es_beta, sigma, d_idxnupts);
+      RETURN_IF_CUDA_ERROR
+    }
+  } else {
     for (int t = 0; t < blksize; t++) {
-        if (d_plan->opts.gpu_kerevalmeth == 1) {
-            interp_3d_subprob<T, 1><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
-                d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, es_c, es_beta, sigma,
-                d_binstartpts, d_binsize, bin_size_x, bin_size_y, bin_size_z, d_subprob_to_bin, d_subprobstartpts,
-                d_numsubprob, maxsubprobsize, numbins[0], numbins[1], numbins[2], d_idxnupts);
-            RETURN_IF_CUDA_ERROR
-        } else {
-            interp_3d_subprob<T, 0><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
-                d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, es_c, es_beta, sigma,
-                d_binstartpts, d_binsize, bin_size_x, bin_size_y, bin_size_z, d_subprob_to_bin, d_subprobstartpts,
-                d_numsubprob, maxsubprobsize, numbins[0], numbins[1], numbins[2], d_idxnupts);
-            RETURN_IF_CUDA_ERROR
-        }
+      interp_3d_nupts_driven<T, 0><<<blocks, threadsPerBlock, 0, stream>>>(
+          d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3,
+          es_c, es_beta, sigma, d_idxnupts);
+      RETURN_IF_CUDA_ERROR
+    }
+  }
+
+  return 0;
+}
+
+template<typename T>
+int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan,
+                       int blksize) {
+  auto &stream = d_plan->stream;
+
+  int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells
+  int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize;
+
+  // assume that bin_size_x > ns/2;
+  int bin_size_x = d_plan->opts.gpu_binsizex;
+  int bin_size_y = d_plan->opts.gpu_binsizey;
+  int bin_size_z = d_plan->opts.gpu_binsizez;
+  int numbins[3];
+  numbins[0] = ceil((T)nf1 / bin_size_x);
+  numbins[1] = ceil((T)nf2 / bin_size_y);
+  numbins[2] = ceil((T)nf3 / bin_size_z);
+
+  T *d_kx               = d_plan->kx;
+  T *d_ky               = d_plan->ky;
+  T *d_kz               = d_plan->kz;
+  cuda_complex<T> *d_c  = d_plan->c;
+  cuda_complex<T> *d_fw = d_plan->fw;
+
+  int *d_binsize         = d_plan->binsize;
+  int *d_binstartpts     = d_plan->binstartpts;
+  int *d_numsubprob      = d_plan->numsubprob;
+  int *d_subprobstartpts = d_plan->subprobstartpts;
+  int *d_idxnupts        = d_plan->idxnupts;
+  int *d_subprob_to_bin  = d_plan->subprob_to_bin;
+  int totalnumsubprob    = d_plan->totalnumsubprob;
+
+  T sigma                  = d_plan->spopts.upsampfac;
+  T es_c                   = d_plan->spopts.ES_c;
+  T es_beta                = d_plan->spopts.ES_beta;
+  size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) *
+                             (bin_size_y + 2 * ceil(ns / 2.0)) *
+                             (bin_size_z + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex<T>);
+  if (sharedplanorysize > 49152) {
+    std::cerr << "[cuinterp3d_subprob] error: not enough shared memory\n";
+    return FINUFFT_ERR_INSUFFICIENT_SHMEM;
+  }
+
+  for (int t = 0; t < blksize; t++) {
+    if (d_plan->opts.gpu_kerevalmeth == 1) {
+      interp_3d_subprob<T, 1><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
+          d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3,
+          es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y,
+          bin_size_z, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize,
+          numbins[0], numbins[1], numbins[2], d_idxnupts);
+      RETURN_IF_CUDA_ERROR
+    } else {
+      interp_3d_subprob<T, 0><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
+          d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3,
+          es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y,
+          bin_size_z, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize,
+          numbins[0], numbins[1], numbins[2], d_idxnupts);
+      RETURN_IF_CUDA_ERROR
     }
+  }
 
-    return 0;
+  return 0;
 }
 
 template int cuinterp3d<float>(cufinufft_plan_t<float> *d_plan, int blksize);
 template int cuinterp3d<double>(cufinufft_plan_t<double> *d_plan, int blksize);
 
-template int cuinterp3d_nuptsdriven<float>(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<float> *d_plan,
-                                           int blksize);
-template int cuinterp3d_nuptsdriven<double>(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<double> *d_plan,
-                                            int blksize);
+template int cuinterp3d_nuptsdriven<float>(int nf1, int nf2, int nf3, int M,
+                                           cufinufft_plan_t<float> *d_plan, int blksize);
+template int cuinterp3d_nuptsdriven<double>(
+    int nf1, int nf2, int nf3, int M, cufinufft_plan_t<double> *d_plan, int blksize);
 
-template int cuinterp3d_subprob<float>(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<float> *d_plan, int blksize);
-template int cuinterp3d_subprob<double>(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<double> *d_plan,
-                                        int blksize);
+template int cuinterp3d_subprob<float>(int nf1, int nf2, int nf3, int M,
+                                       cufinufft_plan_t<float> *d_plan, int blksize);
+template int cuinterp3d_subprob<double>(int nf1, int nf2, int nf3, int M,
+                                        cufinufft_plan_t<double> *d_plan, int blksize);
 
 } // namespace spreadinterp
 } // namespace cufinufft
diff --git a/src/cuda/3d/spread3d_wrapper.cu b/src/cuda/3d/spread3d_wrapper.cu
index 13d435e28..fb5ab0495 100644
--- a/src/cuda/3d/spread3d_wrapper.cu
+++ b/src/cuda/3d/spread3d_wrapper.cu
@@ -18,7 +18,7 @@ using namespace cufinufft::memtransfer;
 namespace cufinufft {
 namespace spreadinterp {
 
-template <typename T>
+template<typename T>
 int cuspread3d(cufinufft_plan_t<T> *d_plan, int blksize)
 /*
     A wrapper for different spreading methods.
@@ -31,521 +31,551 @@ int cuspread3d(cufinufft_plan_t<T> *d_plan, int blksize)
     Melody Shih 07/25/19
 */
 {
-    int nf1 = d_plan->nf1;
-    int nf2 = d_plan->nf2;
-    int nf3 = d_plan->nf3;
-    int M = d_plan->M;
-
-    int ier = 0;
-    switch (d_plan->opts.gpu_method) {
-    case 1: {
-        ier = cuspread3d_nuptsdriven<T>(nf1, nf2, nf3, M, d_plan, blksize);
-    } break;
-    case 2: {
-        ier = cuspread3d_subprob<T>(nf1, nf2, nf3, M, d_plan, blksize);
-    } break;
-    case 4: {
-        ier = cuspread3d_blockgather<T>(nf1, nf2, nf3, M, d_plan, blksize);
-    } break;
-    default:
-        std::cerr << "[cuspread3d] error: incorrect method, should be 1,2,4" << std::endl;
-        ier = FINUFFT_ERR_METHOD_NOTVALID;
-    }
-
-    return ier;
-}
-
-template <typename T>
-int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan) {
-    auto &stream = d_plan->stream;
-
-    if (d_plan->opts.gpu_sort) {
-        int bin_size_x = d_plan->opts.gpu_binsizex;
-        int bin_size_y = d_plan->opts.gpu_binsizey;
-        int bin_size_z = d_plan->opts.gpu_binsizez;
-        if (bin_size_x < 0 || bin_size_y < 0 || bin_size_z < 0) {
-            std::cerr << "[cuspread3d_nuptsdriven_prop] error: invalid binsize (binsizex, binsizey, binsizez) = (";
-            std::cerr << bin_size_x << "," << bin_size_y << "," << bin_size_z << ")" << std::endl;
-            return FINUFFT_ERR_BINSIZE_NOTVALID;
-        }
-
-        int numbins[3];
-        numbins[0] = ceil((T)nf1 / bin_size_x);
-        numbins[1] = ceil((T)nf2 / bin_size_y);
-        numbins[2] = ceil((T)nf3 / bin_size_z);
-
-        T *d_kx = d_plan->kx;
-        T *d_ky = d_plan->ky;
-        T *d_kz = d_plan->kz;
-
-        int *d_binsize = d_plan->binsize;
-        int *d_binstartpts = d_plan->binstartpts;
-        int *d_sortidx = d_plan->sortidx;
-        int *d_idxnupts = d_plan->idxnupts;
-
-        int ier;
-        if ((ier = checkCudaErrors(
-                 cudaMemsetAsync(d_binsize, 0, numbins[0] * numbins[1] * numbins[2] * sizeof(int), stream))))
-            return ier;
-        calc_bin_size_noghost_3d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(
-            M, nf1, nf2, nf3, bin_size_x, bin_size_y, bin_size_z, numbins[0], numbins[1], numbins[2], d_binsize, d_kx,
-            d_ky, d_kz, d_sortidx);
-        RETURN_IF_CUDA_ERROR
-
-        int n = numbins[0] * numbins[1] * numbins[2];
-        thrust::device_ptr<int> d_ptr(d_binsize);
-        thrust::device_ptr<int> d_result(d_binstartpts);
-        thrust::exclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result);
-
-        calc_inverse_of_global_sort_index_3d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(
-            M, bin_size_x, bin_size_y, bin_size_z, numbins[0], numbins[1], numbins[2], d_binstartpts, d_sortidx, d_kx,
-            d_ky, d_kz, d_idxnupts, nf1, nf2, nf3);
-        RETURN_IF_CUDA_ERROR
-    } else {
-        int *d_idxnupts = d_plan->idxnupts;
-
-        trivial_global_sort_index_3d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(M, d_idxnupts);
-        RETURN_IF_CUDA_ERROR
-    }
-
-    return 0;
-}
-
-template <typename T>
-int cuspread3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan, int blksize) {
-    auto &stream = d_plan->stream;
-
-    dim3 threadsPerBlock;
-    dim3 blocks;
-
-    int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells
-    T sigma = d_plan->spopts.upsampfac;
-    T es_c = d_plan->spopts.ES_c;
-    T es_beta = d_plan->spopts.ES_beta;
-
-    int *d_idxnupts = d_plan->idxnupts;
-    T *d_kx = d_plan->kx;
-    T *d_ky = d_plan->ky;
-    T *d_kz = d_plan->kz;
-    cuda_complex<T> *d_c = d_plan->c;
-    cuda_complex<T> *d_fw = d_plan->fw;
-
-    threadsPerBlock.x = 16;
-    threadsPerBlock.y = 1;
-    blocks.x = (M + threadsPerBlock.x - 1) / threadsPerBlock.x;
-    blocks.y = 1;
-
-    if (d_plan->opts.gpu_kerevalmeth == 1) {
-        for (int t = 0; t < blksize; t++) {
-            spread_3d_nupts_driven<T, 1>
-                <<<blocks, threadsPerBlock, 0, stream>>>(d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M,
-                                                         ns, nf1, nf2, nf3, es_c, es_beta, sigma, d_idxnupts);
-            RETURN_IF_CUDA_ERROR
-        }
-    } else {
-        for (int t = 0; t < blksize; t++) {
-            spread_3d_nupts_driven<T, 0>
-                <<<blocks, threadsPerBlock, 0, stream>>>(d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M,
-                                                         ns, nf1, nf2, nf3, es_c, es_beta, sigma, d_idxnupts);
-            RETURN_IF_CUDA_ERROR
-        }
-    }
-
-    return 0;
+  int nf1 = d_plan->nf1;
+  int nf2 = d_plan->nf2;
+  int nf3 = d_plan->nf3;
+  int M   = d_plan->M;
+
+  int ier = 0;
+  switch (d_plan->opts.gpu_method) {
+  case 1: {
+    ier = cuspread3d_nuptsdriven<T>(nf1, nf2, nf3, M, d_plan, blksize);
+  } break;
+  case 2: {
+    ier = cuspread3d_subprob<T>(nf1, nf2, nf3, M, d_plan, blksize);
+  } break;
+  case 4: {
+    ier = cuspread3d_blockgather<T>(nf1, nf2, nf3, M, d_plan, blksize);
+  } break;
+  default:
+    std::cerr << "[cuspread3d] error: incorrect method, should be 1,2,4" << std::endl;
+    ier = FINUFFT_ERR_METHOD_NOTVALID;
+  }
+
+  return ier;
 }
 
-template <typename T>
-int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan) {
-    auto &stream = d_plan->stream;
-
-    dim3 threadsPerBlock;
-    dim3 blocks;
-
-    int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize;
-    int o_bin_size_x = d_plan->opts.gpu_obinsizex;
-    int o_bin_size_y = d_plan->opts.gpu_obinsizey;
-    int o_bin_size_z = d_plan->opts.gpu_obinsizez;
-
-    int numobins[3];
-    if (nf1 % o_bin_size_x != 0 || nf2 % o_bin_size_y != 0 || nf3 % o_bin_size_z != 0) {
-        std::cerr << "[cuspread3d_blockgather_prop] error:\n";
-        std::cerr << "       mod(nf(1|2|3), opts.gpu_obinsize(x|y|z)) != 0" << std::endl;
-        std::cerr << "       (nf1, nf2, nf3) = (" << nf1 << ", " << nf2 << ", " << nf3 << ")" << std::endl;
-        std::cerr << "       (obinsizex, obinsizey, obinsizez) = (" << o_bin_size_x << ", " << o_bin_size_y << ", "
-                  << o_bin_size_z << ")" << std::endl;
-        return FINUFFT_ERR_BINSIZE_NOTVALID;
-    }
-
-    numobins[0] = ceil((T)nf1 / o_bin_size_x);
-    numobins[1] = ceil((T)nf2 / o_bin_size_y);
-    numobins[2] = ceil((T)nf3 / o_bin_size_z);
+template<typename T>
+int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M,
+                                cufinufft_plan_t<T> *d_plan) {
+  auto &stream = d_plan->stream;
 
+  if (d_plan->opts.gpu_sort) {
     int bin_size_x = d_plan->opts.gpu_binsizex;
     int bin_size_y = d_plan->opts.gpu_binsizey;
     int bin_size_z = d_plan->opts.gpu_binsizez;
-    if (o_bin_size_x % bin_size_x != 0 || o_bin_size_y % bin_size_y != 0 || o_bin_size_z % bin_size_z != 0) {
-        std::cerr << "[cuspread3d_blockgather_prop] error:\n";
-        std::cerr << "      mod(ops.gpu_obinsize(x|y|z), opts.gpu_binsize(x|y|z)) != 0" << std::endl;
-        std::cerr << "      (binsizex, binsizey, binsizez) = (" << bin_size_x << ", " << bin_size_y << ", "
-                  << bin_size_z << ")" << std::endl;
-        std::cerr << "      (obinsizex, obinsizey, obinsizez) = (" << o_bin_size_x << ", " << o_bin_size_y << ", "
-                  << o_bin_size_z << ")" << std::endl;
-        return FINUFFT_ERR_BINSIZE_NOTVALID;
+    if (bin_size_x < 0 || bin_size_y < 0 || bin_size_z < 0) {
+      std::cerr << "[cuspread3d_nuptsdriven_prop] error: invalid binsize (binsizex, "
+                   "binsizey, binsizez) = (";
+      std::cerr << bin_size_x << "," << bin_size_y << "," << bin_size_z << ")"
+                << std::endl;
+      return FINUFFT_ERR_BINSIZE_NOTVALID;
     }
 
-    int binsperobinx, binsperobiny, binsperobinz;
     int numbins[3];
-    binsperobinx = o_bin_size_x / bin_size_x + 2;
-    binsperobiny = o_bin_size_y / bin_size_y + 2;
-    binsperobinz = o_bin_size_z / bin_size_z + 2;
-    numbins[0] = numobins[0] * (binsperobinx);
-    numbins[1] = numobins[1] * (binsperobiny);
-    numbins[2] = numobins[2] * (binsperobinz);
+    numbins[0] = ceil((T)nf1 / bin_size_x);
+    numbins[1] = ceil((T)nf2 / bin_size_y);
+    numbins[2] = ceil((T)nf3 / bin_size_z);
 
     T *d_kx = d_plan->kx;
     T *d_ky = d_plan->ky;
     T *d_kz = d_plan->kz;
 
-    int *d_binsize = d_plan->binsize;
-    int *d_sortidx = d_plan->sortidx;
+    int *d_binsize     = d_plan->binsize;
     int *d_binstartpts = d_plan->binstartpts;
-    int *d_numsubprob = d_plan->numsubprob;
-    int *d_idxnupts = NULL;
-    int *d_subprobstartpts = d_plan->subprobstartpts;
-    int *d_subprob_to_bin = NULL;
+    int *d_sortidx     = d_plan->sortidx;
+    int *d_idxnupts    = d_plan->idxnupts;
 
     int ier;
-    if ((ier = checkCudaErrors(
-             cudaMemsetAsync(d_binsize, 0, numbins[0] * numbins[1] * numbins[2] * sizeof(int), stream))))
-        return ier;
-
-    locate_nupts_to_bins_ghost<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(
-        M, bin_size_x, bin_size_y, bin_size_z, numobins[0], numobins[1], numobins[2], binsperobinx, binsperobiny,
-        binsperobinz, d_binsize, d_kx, d_ky, d_kz, d_sortidx, nf1, nf2, nf3);
-    RETURN_IF_CUDA_ERROR
-
-    threadsPerBlock.x = 8;
-    threadsPerBlock.y = 8;
-    threadsPerBlock.z = 8;
-
-    blocks.x = (threadsPerBlock.x + numbins[0] - 1) / threadsPerBlock.x;
-    blocks.y = (threadsPerBlock.y + numbins[1] - 1) / threadsPerBlock.y;
-    blocks.z = (threadsPerBlock.z + numbins[2] - 1) / threadsPerBlock.z;
-
-    fill_ghost_bins<<<blocks, threadsPerBlock, 0, stream>>>(binsperobinx, binsperobiny, binsperobinz, numobins[0],
-                                                            numobins[1], numobins[2], d_binsize);
+    if ((ier = checkCudaErrors(cudaMemsetAsync(
+             d_binsize, 0, numbins[0] * numbins[1] * numbins[2] * sizeof(int), stream))))
+      return ier;
+    calc_bin_size_noghost_3d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(
+        M, nf1, nf2, nf3, bin_size_x, bin_size_y, bin_size_z, numbins[0], numbins[1],
+        numbins[2], d_binsize, d_kx, d_ky, d_kz, d_sortidx);
     RETURN_IF_CUDA_ERROR
 
     int n = numbins[0] * numbins[1] * numbins[2];
     thrust::device_ptr<int> d_ptr(d_binsize);
-    thrust::device_ptr<int> d_result(d_binstartpts + 1);
-    thrust::inclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result);
-
-    if ((ier = checkCudaErrors(cudaMemsetAsync(d_binstartpts, 0, sizeof(int), stream))))
-        return ier;
-
-    int totalNUpts;
-    if ((ier = checkCudaErrors(
-             cudaMemcpyAsync(&totalNUpts, &d_binstartpts[n], sizeof(int), cudaMemcpyDeviceToHost, stream))))
-        return ier;
-    cudaStreamSynchronize(stream);
-    if ((ier = checkCudaErrors(cudaMallocAsync(&d_idxnupts, totalNUpts * sizeof(int), stream))))
-        return ier;
-
-    calc_inverse_of_global_sort_index_ghost<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(
-        M, bin_size_x, bin_size_y, bin_size_z, numobins[0], numobins[1], numobins[2], binsperobinx, binsperobiny,
-        binsperobinz, d_binstartpts, d_sortidx, d_kx, d_ky, d_kz, d_idxnupts, nf1, nf2, nf3);
-    cudaError_t err = cudaGetLastError();
-    if (err != cudaSuccess) {
-        fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err));
-        cudaFree(d_idxnupts);
-        return FINUFFT_ERR_CUDA_FAILURE;
-    }
-
-    threadsPerBlock.x = 2;
-    threadsPerBlock.y = 2;
-    threadsPerBlock.z = 2;
-
-    blocks.x = (threadsPerBlock.x + numbins[0] - 1) / threadsPerBlock.x;
-    blocks.y = (threadsPerBlock.y + numbins[1] - 1) / threadsPerBlock.y;
-    blocks.z = (threadsPerBlock.z + numbins[2] - 1) / threadsPerBlock.z;
-
-    ghost_bin_pts_index<<<blocks, threadsPerBlock, 0, stream>>>(binsperobinx, binsperobiny, binsperobinz, numobins[0],
-                                                                numobins[1], numobins[2], d_binsize, d_idxnupts,
-                                                                d_binstartpts, M);
-    err = cudaGetLastError();
-    if (err != cudaSuccess) {
-        fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err));
-        cudaFree(d_idxnupts);
-        return FINUFFT_ERR_CUDA_FAILURE;
-    }
-
-    cudaFree(d_plan->idxnupts);
-    d_plan->idxnupts = d_idxnupts;
+    thrust::device_ptr<int> d_result(d_binstartpts);
+    thrust::exclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result);
 
-    /* --------------------------------------------- */
-    //        Determining Subproblem properties      //
-    /* --------------------------------------------- */
-    n = numobins[0] * numobins[1] * numobins[2];
-    calc_subprob_3d_v1<<<(n + 1024 - 1) / 1024, 1024, 0, stream>>>(binsperobinx, binsperobiny, binsperobinz, d_binsize,
-                                                                   d_numsubprob, maxsubprobsize,
-                                                                   numobins[0] * numobins[1] * numobins[2]);
+    calc_inverse_of_global_sort_index_3d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(
+        M, bin_size_x, bin_size_y, bin_size_z, numbins[0], numbins[1], numbins[2],
+        d_binstartpts, d_sortidx, d_kx, d_ky, d_kz, d_idxnupts, nf1, nf2, nf3);
     RETURN_IF_CUDA_ERROR
+  } else {
+    int *d_idxnupts = d_plan->idxnupts;
 
-    n = numobins[0] * numobins[1] * numobins[2];
-    d_ptr = thrust::device_pointer_cast(d_numsubprob);
-    d_result = thrust::device_pointer_cast(d_subprobstartpts + 1);
-    thrust::inclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result);
-
-    if ((ier = checkCudaErrors(cudaMemsetAsync(d_subprobstartpts, 0, sizeof(int), stream))))
-        return ier;
-
-    int totalnumsubprob;
-    if ((ier = checkCudaErrors(
-             cudaMemcpyAsync(&totalnumsubprob, &d_subprobstartpts[n], sizeof(int), cudaMemcpyDeviceToHost, stream))))
-        return ier;
-    cudaStreamSynchronize(stream);
-    if ((ier = checkCudaErrors(cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream))))
-        return ier;
-    map_b_into_subprob_3d_v1<<<(n + 1024 - 1) / 1024, 1024, 0, stream>>>(d_subprob_to_bin, d_subprobstartpts,
-                                                                         d_numsubprob, n);
-    err = cudaGetLastError();
-    if (err != cudaSuccess) {
-        fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err));
-        cudaFree(d_subprob_to_bin);
-        return FINUFFT_ERR_CUDA_FAILURE;
-    }
-
-    assert(d_subprob_to_bin != NULL);
-    cudaFree(d_plan->subprob_to_bin);
-    d_plan->subprob_to_bin = d_subprob_to_bin;
-    d_plan->totalnumsubprob = totalnumsubprob;
+    trivial_global_sort_index_3d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(M,
+                                                                             d_idxnupts);
+    RETURN_IF_CUDA_ERROR
+  }
 
-    return 0;
+  return 0;
 }
 
-template <typename T>
-int cuspread3d_blockgather(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan, int blksize) {
-    auto &stream = d_plan->stream;
-
-    int ns = d_plan->spopts.nspread;
-    T es_c = d_plan->spopts.ES_c;
-    T es_beta = d_plan->spopts.ES_beta;
-    T sigma = d_plan->spopts.upsampfac;
-    int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize;
-
-    int obin_size_x = d_plan->opts.gpu_obinsizex;
-    int obin_size_y = d_plan->opts.gpu_obinsizey;
-    int obin_size_z = d_plan->opts.gpu_obinsizez;
-    int bin_size_x = d_plan->opts.gpu_binsizex;
-    int bin_size_y = d_plan->opts.gpu_binsizey;
-    int bin_size_z = d_plan->opts.gpu_binsizez;
-    int numobins[3];
-    numobins[0] = ceil((T)nf1 / obin_size_x);
-    numobins[1] = ceil((T)nf2 / obin_size_y);
-    numobins[2] = ceil((T)nf3 / obin_size_z);
+template<typename T>
+int cuspread3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan,
+                           int blksize) { // launches spread_3d_nupts_driven once per t in [0, blksize); returns 0 at the end
+  auto &stream = d_plan->stream; // every kernel launch below is enqueued on the plan's stream
 
-    int binsperobinx, binsperobiny, binsperobinz;
-    binsperobinx = obin_size_x / bin_size_x + 2;
-    binsperobiny = obin_size_y / bin_size_y + 2;
-    binsperobinz = obin_size_z / bin_size_z + 2;
+  dim3 threadsPerBlock;
+  dim3 blocks;
 
-    T *d_kx = d_plan->kx;
-    T *d_ky = d_plan->ky;
-    T *d_kz = d_plan->kz;
-    cuda_complex<T> *d_c = d_plan->c;
-    cuda_complex<T> *d_fw = d_plan->fw;
+  int ns    = d_plan->spopts.nspread; // psi's support in terms of number of cells
+  T sigma   = d_plan->spopts.upsampfac;
+  T es_c    = d_plan->spopts.ES_c;
+  T es_beta = d_plan->spopts.ES_beta;
 
-    int *d_binstartpts = d_plan->binstartpts;
-    int *d_subprobstartpts = d_plan->subprobstartpts;
-    int *d_idxnupts = d_plan->idxnupts;
+  int *d_idxnupts       = d_plan->idxnupts;
+  T *d_kx               = d_plan->kx;
+  T *d_ky               = d_plan->ky;
+  T *d_kz               = d_plan->kz;
+  cuda_complex<T> *d_c  = d_plan->c;
+  cuda_complex<T> *d_fw = d_plan->fw;
 
-    int totalnumsubprob = d_plan->totalnumsubprob;
-    int *d_subprob_to_bin = d_plan->subprob_to_bin;
+  threadsPerBlock.x = 16; // 16 threads per block along x
+  threadsPerBlock.y = 1;
+  blocks.x          = (M + threadsPerBlock.x - 1) / threadsPerBlock.x; // ceil(M / 16) blocks
+  blocks.y          = 1;
 
-    size_t sharedplanorysize = obin_size_x * obin_size_y * obin_size_z * sizeof(cuda_complex<T>);
-    if (sharedplanorysize > 49152) {
-        std::cerr << "[cuspread3d_blockgather] error: not enough shared memory" << std::endl;
-        return FINUFFT_ERR_INSUFFICIENT_SHMEM;
+  if (d_plan->opts.gpu_kerevalmeth == 1) { // exactly 1 picks the <T, 1> instantiation, anything else <T, 0>
+    for (int t = 0; t < blksize; t++) {
+      spread_3d_nupts_driven<T, 1><<<blocks, threadsPerBlock, 0, stream>>>(
+          d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, // entry t: c offset t*M, fw offset t*nf1*nf2*nf3
+          es_c, es_beta, sigma, d_idxnupts);
+      RETURN_IF_CUDA_ERROR // macro defined elsewhere; presumably returns early on a launch error (confirm)
     }
-
+  } else {
     for (int t = 0; t < blksize; t++) {
-        if (d_plan->opts.gpu_kerevalmeth == 1) {
-            spread_3d_block_gather<T, 1><<<totalnumsubprob, 64, sharedplanorysize, stream>>>(
-                d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, es_c, es_beta, sigma,
-                d_binstartpts, obin_size_x, obin_size_y, obin_size_z, binsperobinx * binsperobiny * binsperobinz,
-                d_subprob_to_bin, d_subprobstartpts, maxsubprobsize, numobins[0], numobins[1], numobins[2], d_idxnupts);
-            RETURN_IF_CUDA_ERROR
-        } else {
-            spread_3d_block_gather<T, 0><<<totalnumsubprob, 64, sharedplanorysize, stream>>>(
-                d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, es_c, es_beta, sigma,
-                d_binstartpts, obin_size_x, obin_size_y, obin_size_z, binsperobinx * binsperobiny * binsperobinz,
-                d_subprob_to_bin, d_subprobstartpts, maxsubprobsize, numobins[0], numobins[1], numobins[2], d_idxnupts);
-            RETURN_IF_CUDA_ERROR
-        }
+      spread_3d_nupts_driven<T, 0><<<blocks, threadsPerBlock, 0, stream>>>(
+          d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, // same arguments as the <T, 1> launch
+          es_c, es_beta, sigma, d_idxnupts);
+      RETURN_IF_CUDA_ERROR
     }
+  }
 
-    return 0;
+  return 0;
 }
 
-template <typename T>
-int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan) {
-    auto &stream = d_plan->stream;
+template<typename T>
+int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M,
+                                cufinufft_plan_t<T> *d_plan) { // on success replaces d_plan->idxnupts and ->subprob_to_bin, sets ->totalnumsubprob, returns 0
+  auto &stream = d_plan->stream; // all device work below is enqueued on the plan's stream
+
+  dim3 threadsPerBlock;
+  dim3 blocks;
+
+  int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize;
+  int o_bin_size_x   = d_plan->opts.gpu_obinsizex;
+  int o_bin_size_y   = d_plan->opts.gpu_obinsizey;
+  int o_bin_size_z   = d_plan->opts.gpu_obinsizez;
+
+  int numobins[3];
+  if (nf1 % o_bin_size_x != 0 || nf2 % o_bin_size_y != 0 || nf3 % o_bin_size_z != 0) { // each nf must be a multiple of its obin size
+    std::cerr << "[cuspread3d_blockgather_prop] error:\n";
+    std::cerr << "       mod(nf(1|2|3), opts.gpu_obinsize(x|y|z)) != 0" << std::endl;
+    std::cerr << "       (nf1, nf2, nf3) = (" << nf1 << ", " << nf2 << ", " << nf3 << ")"
+              << std::endl;
+    std::cerr << "       (obinsizex, obinsizey, obinsizez) = (" << o_bin_size_x << ", "
+              << o_bin_size_y << ", " << o_bin_size_z << ")" << std::endl;
+    return FINUFFT_ERR_BINSIZE_NOTVALID;
+  }
+
+  numobins[0] = ceil((T)nf1 / o_bin_size_x); // quotient is a whole number here: divisibility was checked above
+  numobins[1] = ceil((T)nf2 / o_bin_size_y);
+  numobins[2] = ceil((T)nf3 / o_bin_size_z);
+
+  int bin_size_x = d_plan->opts.gpu_binsizex;
+  int bin_size_y = d_plan->opts.gpu_binsizey;
+  int bin_size_z = d_plan->opts.gpu_binsizez;
+  if (o_bin_size_x % bin_size_x != 0 || o_bin_size_y % bin_size_y != 0 || // each obin size must be a multiple of its bin size
+      o_bin_size_z % bin_size_z != 0) {
+    std::cerr << "[cuspread3d_blockgather_prop] error:\n";
+    std::cerr << "      mod(ops.gpu_obinsize(x|y|z), opts.gpu_binsize(x|y|z)) != 0"
+              << std::endl;
+    std::cerr << "      (binsizex, binsizey, binsizez) = (" << bin_size_x << ", "
+              << bin_size_y << ", " << bin_size_z << ")" << std::endl;
+    std::cerr << "      (obinsizex, obinsizey, obinsizez) = (" << o_bin_size_x << ", "
+              << o_bin_size_y << ", " << o_bin_size_z << ")" << std::endl;
+    return FINUFFT_ERR_BINSIZE_NOTVALID;
+  }
+
+  int binsperobinx, binsperobiny, binsperobinz;
+  int numbins[3];
+  binsperobinx = o_bin_size_x / bin_size_x + 2; // +2 per axis: presumably one ghost bin on each side (confirm against the kernels)
+  binsperobiny = o_bin_size_y / bin_size_y + 2;
+  binsperobinz = o_bin_size_z / bin_size_z + 2;
+  numbins[0]   = numobins[0] * (binsperobinx); // bins along x summed over all obins, including the +2 per obin
+  numbins[1]   = numobins[1] * (binsperobiny);
+  numbins[2]   = numobins[2] * (binsperobinz);
+
+  T *d_kx = d_plan->kx;
+  T *d_ky = d_plan->ky;
+  T *d_kz = d_plan->kz;
+
+  int *d_binsize         = d_plan->binsize;
+  int *d_sortidx         = d_plan->sortidx;
+  int *d_binstartpts     = d_plan->binstartpts;
+  int *d_numsubprob      = d_plan->numsubprob;
+  int *d_idxnupts        = NULL; // allocated below once totalNUpts is known
+  int *d_subprobstartpts = d_plan->subprobstartpts;
+  int *d_subprob_to_bin  = NULL; // allocated below once totalnumsubprob is known
+
+  int ier;
+  if ((ier = checkCudaErrors(cudaMemsetAsync( // zero all numbins[0]*numbins[1]*numbins[2] ints of d_binsize
+           d_binsize, 0, numbins[0] * numbins[1] * numbins[2] * sizeof(int), stream))))
+    return ier;
 
-    int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize;
-    int bin_size_x = d_plan->opts.gpu_binsizex;
-    int bin_size_y = d_plan->opts.gpu_binsizey;
-    int bin_size_z = d_plan->opts.gpu_binsizez;
-    if (bin_size_x < 0 || bin_size_y < 0 || bin_size_z < 0) {
-        std::cerr << "error: invalid binsize (binsizex, binsizey, binsizez) = (";
-        std::cerr << bin_size_x << "," << bin_size_y << "," << bin_size_z << ")" << std::endl;
-        return FINUFFT_ERR_BINSIZE_NOTVALID;
-    }
+  locate_nupts_to_bins_ghost<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( // ceil(M / 1024) blocks of 1024 threads
+      M, bin_size_x, bin_size_y, bin_size_z, numobins[0], numobins[1], numobins[2],
+      binsperobinx, binsperobiny, binsperobinz, d_binsize, d_kx, d_ky, d_kz, d_sortidx,
+      nf1, nf2, nf3);
+  RETURN_IF_CUDA_ERROR // macro defined elsewhere; presumably returns early on a launch error (confirm)
 
-    int numbins[3];
-    numbins[0] = ceil((T)nf1 / bin_size_x);
-    numbins[1] = ceil((T)nf2 / bin_size_y);
-    numbins[2] = ceil((T)nf3 / bin_size_z);
+  threadsPerBlock.x = 8;
+  threadsPerBlock.y = 8;
+  threadsPerBlock.z = 8;
 
-    T *d_kx = d_plan->kx;
-    T *d_ky = d_plan->ky;
-    T *d_kz = d_plan->kz;
+  blocks.x = (threadsPerBlock.x + numbins[0] - 1) / threadsPerBlock.x; // ceil(numbins[0] / threadsPerBlock.x)
+  blocks.y = (threadsPerBlock.y + numbins[1] - 1) / threadsPerBlock.y;
+  blocks.z = (threadsPerBlock.z + numbins[2] - 1) / threadsPerBlock.z;
 
-    int *d_binsize = d_plan->binsize;
-    int *d_binstartpts = d_plan->binstartpts;
-    int *d_sortidx = d_plan->sortidx;
-    int *d_numsubprob = d_plan->numsubprob;
-    int *d_subprobstartpts = d_plan->subprobstartpts;
-    int *d_idxnupts = d_plan->idxnupts;
+  fill_ghost_bins<<<blocks, threadsPerBlock, 0, stream>>>(
+      binsperobinx, binsperobiny, binsperobinz, numobins[0], numobins[1], numobins[2],
+      d_binsize);
+  RETURN_IF_CUDA_ERROR
 
-    int *d_subprob_to_bin = NULL;
+  int n = numbins[0] * numbins[1] * numbins[2]; // here n is the total number of bins
+  thrust::device_ptr<int> d_ptr(d_binsize);
+  thrust::device_ptr<int> d_result(d_binstartpts + 1); // output shifted by one element
+  thrust::inclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); // d_binstartpts[i+1] = d_binsize[0] + ... + d_binsize[i]
 
-    int ier;
-    if ((ier = checkCudaErrors(
-             cudaMemsetAsync(d_binsize, 0, numbins[0] * numbins[1] * numbins[2] * sizeof(int), stream))))
-        return ier;
-    calc_bin_size_noghost_3d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(
-        M, nf1, nf2, nf3, bin_size_x, bin_size_y, bin_size_z, numbins[0], numbins[1], numbins[2], d_binsize, d_kx, d_ky,
-        d_kz, d_sortidx);
-    RETURN_IF_CUDA_ERROR
-
-    int n = numbins[0] * numbins[1] * numbins[2];
-    thrust::device_ptr<int> d_ptr(d_binsize);
-    thrust::device_ptr<int> d_result(d_binstartpts);
-    thrust::exclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result);
-
-    calc_inverse_of_global_sort_index_3d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(
-        M, bin_size_x, bin_size_y, bin_size_z, numbins[0], numbins[1], numbins[2], d_binstartpts, d_sortidx, d_kx, d_ky,
-        d_kz, d_idxnupts, nf1, nf2, nf3);
-    RETURN_IF_CUDA_ERROR
-    /* --------------------------------------------- */
-    //        Determining Subproblem properties      //
-    /* --------------------------------------------- */
-    calc_subprob_3d_v2<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(d_binsize, d_numsubprob, maxsubprobsize,
-                                                                   numbins[0] * numbins[1] * numbins[2]);
-    RETURN_IF_CUDA_ERROR
+  if ((ier = checkCudaErrors(cudaMemsetAsync(d_binstartpts, 0, sizeof(int), stream)))) // d_binstartpts[0] = 0
+    return ier;
 
-    d_ptr = thrust::device_pointer_cast(d_numsubprob);
-    d_result = thrust::device_pointer_cast(d_subprobstartpts + 1);
-    thrust::inclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result);
-    int totalnumsubprob;
-    if (checkCudaErrors(cudaMemsetAsync(d_subprobstartpts, 0, sizeof(int), stream)) ||
-        checkCudaErrors(
-            cudaMemcpyAsync(&totalnumsubprob, &d_subprobstartpts[n], sizeof(int), cudaMemcpyDeviceToHost, stream)
-            )
-        )
-        return FINUFFT_ERR_CUDA_FAILURE;
-    cudaStreamSynchronize(stream);
-    if(checkCudaErrors(cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream)))
-        return FINUFFT_ERR_CUDA_FAILURE;
-
-    map_b_into_subprob_3d_v2<<<(numbins[0] * numbins[1] + 1024 - 1) / 1024, 1024, 0, stream>>>(
-        d_subprob_to_bin, d_subprobstartpts, d_numsubprob, numbins[0] * numbins[1] * numbins[2]);
-    cudaError_t err = cudaGetLastError();
-    if (err != cudaSuccess) {
-        fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err));
-        cudaFree(d_subprob_to_bin);
-        return FINUFFT_ERR_CUDA_FAILURE;
-    }
+  int totalNUpts;
+  if ((ier = checkCudaErrors(cudaMemcpyAsync(&totalNUpts, &d_binstartpts[n], sizeof(int), // element n holds the scan total over all n bins
+                                             cudaMemcpyDeviceToHost, stream))))
+    return ier;
+  cudaStreamSynchronize(stream); // the async copy must complete before totalNUpts is read on the host
+  if ((ier = checkCudaErrors(
+           cudaMallocAsync(&d_idxnupts, totalNUpts * sizeof(int), stream)))) // one int per counted entry
+    return ier;
 
-    assert(d_subprob_to_bin != NULL);
-    if (d_plan->subprob_to_bin != NULL)
-        cudaFree(d_plan->subprob_to_bin);
-    d_plan->subprob_to_bin = d_subprob_to_bin;
-    assert(d_plan->subprob_to_bin != nullptr);
-    d_plan->totalnumsubprob = totalnumsubprob;
+  calc_inverse_of_global_sort_index_ghost<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(
+      M, bin_size_x, bin_size_y, bin_size_z, numobins[0], numobins[1], numobins[2],
+      binsperobinx, binsperobiny, binsperobinz, d_binstartpts, d_sortidx, d_kx, d_ky,
+      d_kz, d_idxnupts, nf1, nf2, nf3);
+  cudaError_t err = cudaGetLastError(); // checked by hand (not via the macro) so the new buffer can be freed on failure
+  if (err != cudaSuccess) {
+    fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err));
+    cudaFree(d_idxnupts); // d_plan->idxnupts has not been touched yet
+    return FINUFFT_ERR_CUDA_FAILURE;
+  }
+
+  threadsPerBlock.x = 2;
+  threadsPerBlock.y = 2;
+  threadsPerBlock.z = 2;
+
+  blocks.x = (threadsPerBlock.x + numbins[0] - 1) / threadsPerBlock.x; // grid recomputed for the 2x2x2 block shape
+  blocks.y = (threadsPerBlock.y + numbins[1] - 1) / threadsPerBlock.y;
+  blocks.z = (threadsPerBlock.z + numbins[2] - 1) / threadsPerBlock.z;
+
+  ghost_bin_pts_index<<<blocks, threadsPerBlock, 0, stream>>>(
+      binsperobinx, binsperobiny, binsperobinz, numobins[0], numobins[1], numobins[2],
+      d_binsize, d_idxnupts, d_binstartpts, M);
+  err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err));
+    cudaFree(d_idxnupts);
+    return FINUFFT_ERR_CUDA_FAILURE;
+  }
+
+  cudaFree(d_plan->idxnupts); // release the plan's previous index buffer ...
+  d_plan->idxnupts = d_idxnupts; // ... and hand ownership of the new one to the plan
+
+  /* --------------------------------------------- */
+  //        Determining Subproblem properties      //
+  /* --------------------------------------------- */
+  n = numobins[0] * numobins[1] * numobins[2]; // from here on n counts obins, not bins
+  calc_subprob_3d_v1<<<(n + 1024 - 1) / 1024, 1024, 0, stream>>>(
+      binsperobinx, binsperobiny, binsperobinz, d_binsize, d_numsubprob, maxsubprobsize,
+      numobins[0] * numobins[1] * numobins[2]);
+  RETURN_IF_CUDA_ERROR
+
+  n        = numobins[0] * numobins[1] * numobins[2]; // same value as assigned above
+  d_ptr    = thrust::device_pointer_cast(d_numsubprob);
+  d_result = thrust::device_pointer_cast(d_subprobstartpts + 1); // same shift-by-one layout as used for d_binstartpts
+  thrust::inclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result);
+
+  if ((ier = checkCudaErrors(cudaMemsetAsync(d_subprobstartpts, 0, sizeof(int), stream)))) // d_subprobstartpts[0] = 0
+    return ier;
 
-    return 0;
+  int totalnumsubprob;
+  if ((ier =
+           checkCudaErrors(cudaMemcpyAsync(&totalnumsubprob, &d_subprobstartpts[n], // element n holds the scan total
+                                           sizeof(int), cudaMemcpyDeviceToHost, stream))))
+    return ier;
+  cudaStreamSynchronize(stream); // totalnumsubprob must be on the host before it sizes the allocation
+  if ((ier = checkCudaErrors(
+           cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream))))
+    return ier;
+  map_b_into_subprob_3d_v1<<<(n + 1024 - 1) / 1024, 1024, 0, stream>>>(
+      d_subprob_to_bin, d_subprobstartpts, d_numsubprob, n);
+  err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err));
+    cudaFree(d_subprob_to_bin); // d_plan->subprob_to_bin has not been touched yet
+    return FINUFFT_ERR_CUDA_FAILURE;
+  }
+
+  assert(d_subprob_to_bin != NULL);
+  cudaFree(d_plan->subprob_to_bin); // release the previous table before adopting the new one
+  d_plan->subprob_to_bin  = d_subprob_to_bin;
+  d_plan->totalnumsubprob = totalnumsubprob;
+
+  return 0;
 }
 
-template <typename T>
-int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan, int blksize) {
-    auto &stream = d_plan->stream;
-
-    int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells
-    int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize;
-
-    // assume that bin_size_x > ns/2;
-    int bin_size_x = d_plan->opts.gpu_binsizex;
-    int bin_size_y = d_plan->opts.gpu_binsizey;
-    int bin_size_z = d_plan->opts.gpu_binsizez;
-    int numbins[3];
-    numbins[0] = ceil((T)nf1 / bin_size_x);
-    numbins[1] = ceil((T)nf2 / bin_size_y);
-    numbins[2] = ceil((T)nf3 / bin_size_z);
-
-    T *d_kx = d_plan->kx;
-    T *d_ky = d_plan->ky;
-    T *d_kz = d_plan->kz;
-    cuda_complex<T> *d_c = d_plan->c;
-    cuda_complex<T> *d_fw = d_plan->fw;
+template<typename T>
+int cuspread3d_blockgather(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan,
+                           int blksize) { // launches spread_3d_block_gather once per t in [0, blksize); returns 0 at the end
+  auto &stream = d_plan->stream; // every kernel launch below is enqueued on the plan's stream
+
+  int ns             = d_plan->spopts.nspread;
+  T es_c             = d_plan->spopts.ES_c;
+  T es_beta          = d_plan->spopts.ES_beta;
+  T sigma            = d_plan->spopts.upsampfac;
+  int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize;
+
+  int obin_size_x = d_plan->opts.gpu_obinsizex;
+  int obin_size_y = d_plan->opts.gpu_obinsizey;
+  int obin_size_z = d_plan->opts.gpu_obinsizez;
+  int bin_size_x  = d_plan->opts.gpu_binsizex;
+  int bin_size_y  = d_plan->opts.gpu_binsizey;
+  int bin_size_z  = d_plan->opts.gpu_binsizez;
+  int numobins[3];
+  numobins[0] = ceil((T)nf1 / obin_size_x); // same formula as in cuspread3d_blockgather_prop
+  numobins[1] = ceil((T)nf2 / obin_size_y);
+  numobins[2] = ceil((T)nf3 / obin_size_z);
+
+  int binsperobinx, binsperobiny, binsperobinz;
+  binsperobinx = obin_size_x / bin_size_x + 2; // same +2 per axis as in cuspread3d_blockgather_prop
+  binsperobiny = obin_size_y / bin_size_y + 2;
+  binsperobinz = obin_size_z / bin_size_z + 2;
+
+  T *d_kx               = d_plan->kx;
+  T *d_ky               = d_plan->ky;
+  T *d_kz               = d_plan->kz;
+  cuda_complex<T> *d_c  = d_plan->c;
+  cuda_complex<T> *d_fw = d_plan->fw;
+
+  int *d_binstartpts     = d_plan->binstartpts;
+  int *d_subprobstartpts = d_plan->subprobstartpts;
+  int *d_idxnupts        = d_plan->idxnupts;
+
+  int totalnumsubprob   = d_plan->totalnumsubprob; // both fields are written by cuspread3d_blockgather_prop
+  int *d_subprob_to_bin = d_plan->subprob_to_bin;
+
+  size_t sharedplanorysize = // bytes of dynamic shared memory requested per block: one cuda_complex<T> per obin cell
+      obin_size_x * obin_size_y * obin_size_z * sizeof(cuda_complex<T>);
+  if (sharedplanorysize > 49152) { // 49152 bytes = 48 KiB
+    std::cerr << "[cuspread3d_blockgather] error: not enough shared memory" << std::endl;
+    return FINUFFT_ERR_INSUFFICIENT_SHMEM;
+  }
+
+  for (int t = 0; t < blksize; t++) {
+    if (d_plan->opts.gpu_kerevalmeth == 1) { // exactly 1 picks <T, 1>; anything else picks <T, 0>
+      spread_3d_block_gather<T, 1><<<totalnumsubprob, 64, sharedplanorysize, stream>>>( // totalnumsubprob blocks of 64 threads
+          d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, // entry t: c offset t*M, fw offset t*nf1*nf2*nf3
+          es_c, es_beta, sigma, d_binstartpts, obin_size_x, obin_size_y, obin_size_z,
+          binsperobinx * binsperobiny * binsperobinz, d_subprob_to_bin, d_subprobstartpts,
+          maxsubprobsize, numobins[0], numobins[1], numobins[2], d_idxnupts);
+      RETURN_IF_CUDA_ERROR // macro defined elsewhere; presumably returns early on a launch error (confirm)
+    } else {
+      spread_3d_block_gather<T, 0><<<totalnumsubprob, 64, sharedplanorysize, stream>>>(
+          d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3,
+          es_c, es_beta, sigma, d_binstartpts, obin_size_x, obin_size_y, obin_size_z,
+          binsperobinx * binsperobiny * binsperobinz, d_subprob_to_bin, d_subprobstartpts,
+          maxsubprobsize, numobins[0], numobins[1], numobins[2], d_idxnupts);
+      RETURN_IF_CUDA_ERROR
+    }
+  }
 
-    int *d_binsize = d_plan->binsize;
-    int *d_binstartpts = d_plan->binstartpts;
-    int *d_numsubprob = d_plan->numsubprob;
-    int *d_subprobstartpts = d_plan->subprobstartpts;
-    int *d_idxnupts = d_plan->idxnupts;
+  return 0;
+}
 
-    int totalnumsubprob = d_plan->totalnumsubprob;
-    int *d_subprob_to_bin = d_plan->subprob_to_bin;
-
-    T sigma = d_plan->spopts.upsampfac;
-    T es_c = d_plan->spopts.ES_c;
-    T es_beta = d_plan->spopts.ES_beta;
-    size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)) *
-                               (bin_size_z + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex<T>);
-    if (sharedplanorysize > 49152) {
-        std::cerr << "[cuspread3d_subprob] error: not enough shared memory (" << sharedplanorysize << ")" << std::endl;
-        return FINUFFT_ERR_INSUFFICIENT_SHMEM;
-    }
+template<typename T>
+int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M,
+                            cufinufft_plan_t<T> *d_plan) { // on success replaces d_plan->subprob_to_bin, sets ->totalnumsubprob, returns 0
+  auto &stream = d_plan->stream; // all device work below is enqueued on the plan's stream
+
+  int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize;
+  int bin_size_x     = d_plan->opts.gpu_binsizex;
+  int bin_size_y     = d_plan->opts.gpu_binsizey;
+  int bin_size_z     = d_plan->opts.gpu_binsizez;
+  if (bin_size_x < 0 || bin_size_y < 0 || bin_size_z < 0) { // NOTE(review): a size of 0 passes this check yet is used as a divisor below
+    std::cerr << "error: invalid binsize (binsizex, binsizey, binsizez) = (";
+    std::cerr << bin_size_x << "," << bin_size_y << "," << bin_size_z << ")" << std::endl;
+    return FINUFFT_ERR_BINSIZE_NOTVALID;
+  }
+
+  int numbins[3];
+  numbins[0] = ceil((T)nf1 / bin_size_x); // rounded up, so a partial last bin still counts
+  numbins[1] = ceil((T)nf2 / bin_size_y);
+  numbins[2] = ceil((T)nf3 / bin_size_z);
+
+  T *d_kx = d_plan->kx;
+  T *d_ky = d_plan->ky;
+  T *d_kz = d_plan->kz;
+
+  int *d_binsize         = d_plan->binsize;
+  int *d_binstartpts     = d_plan->binstartpts;
+  int *d_sortidx         = d_plan->sortidx;
+  int *d_numsubprob      = d_plan->numsubprob;
+  int *d_subprobstartpts = d_plan->subprobstartpts;
+  int *d_idxnupts        = d_plan->idxnupts; // reused in place here (not reallocated)
+
+  int *d_subprob_to_bin = NULL; // allocated below once totalnumsubprob is known
+
+  int ier;
+  if ((ier = checkCudaErrors(cudaMemsetAsync( // zero all numbins[0]*numbins[1]*numbins[2] ints of d_binsize
+           d_binsize, 0, numbins[0] * numbins[1] * numbins[2] * sizeof(int), stream))))
+    return ier;
+  calc_bin_size_noghost_3d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( // ceil(M / 1024) blocks of 1024 threads
+      M, nf1, nf2, nf3, bin_size_x, bin_size_y, bin_size_z, numbins[0], numbins[1],
+      numbins[2], d_binsize, d_kx, d_ky, d_kz, d_sortidx);
+  RETURN_IF_CUDA_ERROR // macro defined elsewhere; presumably returns early on a launch error (confirm)
+
+  int n = numbins[0] * numbins[1] * numbins[2]; // total number of bins
+  thrust::device_ptr<int> d_ptr(d_binsize);
+  thrust::device_ptr<int> d_result(d_binstartpts);
+  thrust::exclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); // d_binstartpts[i] = sum of d_binsize[0..i-1]
+
+  calc_inverse_of_global_sort_index_3d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(
+      M, bin_size_x, bin_size_y, bin_size_z, numbins[0], numbins[1], numbins[2],
+      d_binstartpts, d_sortidx, d_kx, d_ky, d_kz, d_idxnupts, nf1, nf2, nf3);
+  RETURN_IF_CUDA_ERROR
+  /* --------------------------------------------- */
+  //        Determining Subproblem properties      //
+  /* --------------------------------------------- */
+  calc_subprob_3d_v2<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( // NOTE(review): grid sized from M, count passed is the bin total; confirm kernel indexing
+      d_binsize, d_numsubprob, maxsubprobsize, numbins[0] * numbins[1] * numbins[2]);
+  RETURN_IF_CUDA_ERROR
+
+  d_ptr    = thrust::device_pointer_cast(d_numsubprob);
+  d_result = thrust::device_pointer_cast(d_subprobstartpts + 1); // output shifted by one element
+  thrust::inclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); // d_subprobstartpts[i+1] = sum of d_numsubprob[0..i]
+  int totalnumsubprob;
+  if (checkCudaErrors(cudaMemsetAsync(d_subprobstartpts, 0, sizeof(int), stream)) || // d_subprobstartpts[0] = 0
+      checkCudaErrors(cudaMemcpyAsync(&totalnumsubprob, &d_subprobstartpts[n], // element n holds the scan total
+                                      sizeof(int), cudaMemcpyDeviceToHost, stream)))
+    return FINUFFT_ERR_CUDA_FAILURE;
+  cudaStreamSynchronize(stream); // the async copy must complete before totalnumsubprob is read on the host
+  if (checkCudaErrors(
+          cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream)))
+    return FINUFFT_ERR_CUDA_FAILURE;
+
+  map_b_into_subprob_3d_v2<<<(numbins[0] * numbins[1] + 1024 - 1) / 1024, 1024, 0,
+                             stream>>>(d_subprob_to_bin, d_subprobstartpts, d_numsubprob,
+                                       numbins[0] * numbins[1] * numbins[2]); // NOTE(review): grid sized from numbins[0]*numbins[1] only; confirm the kernel covers all bins
+  cudaError_t err = cudaGetLastError(); // checked by hand (not via the macro) so the new buffer can be freed on failure
+  if (err != cudaSuccess) {
+    fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err));
+    cudaFree(d_subprob_to_bin); // d_plan->subprob_to_bin has not been touched yet
+    return FINUFFT_ERR_CUDA_FAILURE;
+  }
+
+  assert(d_subprob_to_bin != NULL);
+  if (d_plan->subprob_to_bin != NULL) cudaFree(d_plan->subprob_to_bin); // release the previous table, if any
+  d_plan->subprob_to_bin = d_subprob_to_bin;
+  assert(d_plan->subprob_to_bin != nullptr);
+  d_plan->totalnumsubprob = totalnumsubprob;
+
+  return 0;
+}
 
-    for (int t = 0; t < blksize; t++) {
-        if (d_plan->opts.gpu_kerevalmeth) {
-            spread_3d_subprob<T, 1><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
-                d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, sigma, es_c, es_beta,
-                d_binstartpts, d_binsize, bin_size_x, bin_size_y, bin_size_z, d_subprob_to_bin, d_subprobstartpts,
-                d_numsubprob, maxsubprobsize, numbins[0], numbins[1], numbins[2], d_idxnupts);
-            RETURN_IF_CUDA_ERROR
-        } else {
-            spread_3d_subprob<T, 0><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
-                d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, sigma, es_c, es_beta,
-                d_binstartpts, d_binsize, bin_size_x, bin_size_y, bin_size_z, d_subprob_to_bin, d_subprobstartpts,
-                d_numsubprob, maxsubprobsize, numbins[0], numbins[1], numbins[2], d_idxnupts);
-            RETURN_IF_CUDA_ERROR
-        }
+template<typename T>
+int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<T> *d_plan,
+                       int blksize) { // launches spread_3d_subprob once per t in [0, blksize); returns 0 at the end
+  auto &stream = d_plan->stream; // every kernel launch below is enqueued on the plan's stream
+
+  int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells
+  int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize;
+
+  // assume that bin_size_x > ns/2;
+  int bin_size_x = d_plan->opts.gpu_binsizex;
+  int bin_size_y = d_plan->opts.gpu_binsizey;
+  int bin_size_z = d_plan->opts.gpu_binsizez;
+  int numbins[3];
+  numbins[0] = ceil((T)nf1 / bin_size_x); // same formula as in cuspread3d_subprob_prop
+  numbins[1] = ceil((T)nf2 / bin_size_y);
+  numbins[2] = ceil((T)nf3 / bin_size_z);
+
+  T *d_kx               = d_plan->kx;
+  T *d_ky               = d_plan->ky;
+  T *d_kz               = d_plan->kz;
+  cuda_complex<T> *d_c  = d_plan->c;
+  cuda_complex<T> *d_fw = d_plan->fw;
+
+  int *d_binsize         = d_plan->binsize;
+  int *d_binstartpts     = d_plan->binstartpts;
+  int *d_numsubprob      = d_plan->numsubprob;
+  int *d_subprobstartpts = d_plan->subprobstartpts;
+  int *d_idxnupts        = d_plan->idxnupts;
+
+  int totalnumsubprob   = d_plan->totalnumsubprob; // both fields are written by cuspread3d_subprob_prop
+  int *d_subprob_to_bin = d_plan->subprob_to_bin;
+
+  T sigma                  = d_plan->spopts.upsampfac;
+  T es_c                   = d_plan->spopts.ES_c;
+  T es_beta                = d_plan->spopts.ES_beta;
+  size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) * // each axis: bin size plus 2*ceil(ns/2) extra cells
+                             (bin_size_y + 2 * ceil(ns / 2.0)) *
+                             (bin_size_z + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex<T>); // one cuda_complex<T> per cell
+  if (sharedplanorysize > 49152) { // 49152 bytes = 48 KiB
+    std::cerr << "[cuspread3d_subprob] error: not enough shared memory ("
+              << sharedplanorysize << ")" << std::endl;
+    return FINUFFT_ERR_INSUFFICIENT_SHMEM;
+  }
+
+  for (int t = 0; t < blksize; t++) {
+    if (d_plan->opts.gpu_kerevalmeth) { // any nonzero value picks <T, 1> (cuspread3d_blockgather tests == 1 instead)
+      spread_3d_subprob<T, 1><<<totalnumsubprob, 256, sharedplanorysize, stream>>>( // totalnumsubprob blocks of 256 threads
+          d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, // entry t: c offset t*M, fw offset t*nf1*nf2*nf3
+          sigma, es_c, es_beta, d_binstartpts, d_binsize, bin_size_x, bin_size_y,
+          bin_size_z, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize,
+          numbins[0], numbins[1], numbins[2], d_idxnupts);
+      RETURN_IF_CUDA_ERROR // macro defined elsewhere; presumably returns early on a launch error (confirm)
+    } else {
+      spread_3d_subprob<T, 0><<<totalnumsubprob, 256, sharedplanorysize, stream>>>(
+          d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3,
+          sigma, es_c, es_beta, d_binstartpts, d_binsize, bin_size_x, bin_size_y,
+          bin_size_z, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize,
+          numbins[0], numbins[1], numbins[2], d_idxnupts);
+      RETURN_IF_CUDA_ERROR
     }
+  }
 
-    return 0;
+  return 0;
 }
 
 template int cuspread3d<float>(cufinufft_plan_t<float> *d_plan, int blksize);
 template int cuspread3d<double>(cufinufft_plan_t<double> *d_plan, int blksize);
-template int cuspread3d_nuptsdriven_prop<float>(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<float> *d_plan);
-template int cuspread3d_nuptsdriven_prop<double>(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<double> *d_plan);
-template int cuspread3d_subprob_prop<float>(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<float> *d_plan);
-template int cuspread3d_subprob_prop<double>(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<double> *d_plan);
-template int cuspread3d_blockgather_prop<float>(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<float> *d_plan);
-template int cuspread3d_blockgather_prop<double>(int nf1, int nf2, int nf3, int M, cufinufft_plan_t<double> *d_plan);
+template int cuspread3d_nuptsdriven_prop<float>(int nf1, int nf2, int nf3, int M,
+                                                cufinufft_plan_t<float> *d_plan); // explicit instantiations: each *_prop template for float and double
+template int cuspread3d_nuptsdriven_prop<double>(int nf1, int nf2, int nf3, int M,
+                                                 cufinufft_plan_t<double> *d_plan);
+template int cuspread3d_subprob_prop<float>(int nf1, int nf2, int nf3, int M,
+                                            cufinufft_plan_t<float> *d_plan);
+template int cuspread3d_subprob_prop<double>(int nf1, int nf2, int nf3, int M,
+                                             cufinufft_plan_t<double> *d_plan);
+template int cuspread3d_blockgather_prop<float>(int nf1, int nf2, int nf3, int M,
+                                                cufinufft_plan_t<float> *d_plan);
+template int cuspread3d_blockgather_prop<double>(int nf1, int nf2, int nf3, int M,
+                                                 cufinufft_plan_t<double> *d_plan);
 
 } // namespace spreadinterp
 } // namespace cufinufft
diff --git a/src/cuda/common.cu b/src/cuda/common.cu
index a83688693..a87628a38 100644
--- a/src/cuda/common.cu
+++ b/src/cuda/common.cu
@@ -25,39 +25,42 @@ using std::max;
    cnufftspread's real symmetric kernel. */
 // a , f are intermediate results from function onedim_fseries_kernel_precomp()
 // (see cufinufft/contrib/common.cpp for description)
-template <typename T>
-__global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, cuDoubleComplex *a, T *fwkerhalf1,
-                                       T *fwkerhalf2, T *fwkerhalf3, int ns) {
-    T J2 = ns / 2.0;
-    int q = (int)(2 + 3.0 * J2);
-    int nf;
-    cuDoubleComplex *at = a + threadIdx.y * MAX_NQUAD;
-    T *ft = f + threadIdx.y * MAX_NQUAD;
-    T *oarr;
-    if (threadIdx.y == 0) {
-        oarr = fwkerhalf1;
-        nf = nf1;
-    } else if (threadIdx.y == 1) {
-        oarr = fwkerhalf2;
-        nf = nf2;
-    } else {
-        oarr = fwkerhalf3;
-        nf = nf3;
-    }
-
-    for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < nf / 2 + 1; i += blockDim.x * gridDim.x) {
-        int brk = 0.5 + i;
-        T x = 0.0;
-        for (int n = 0; n < q; n++) {
-            x += ft[n] * 2 * (pow(cabs(at[n]), brk) * cos(brk * carg(at[n])));
-        }
-        oarr[i] = x;
+template<typename T>
+__global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f,
+                                       cuDoubleComplex *a, T *fwkerhalf1, T *fwkerhalf2,
+                                       T *fwkerhalf3, int ns) {
+  T J2  = ns / 2.0;
+  int q = (int)(2 + 3.0 * J2);
+  int nf;
+  cuDoubleComplex *at = a + threadIdx.y * MAX_NQUAD;
+  T *ft               = f + threadIdx.y * MAX_NQUAD;
+  T *oarr;
+  if (threadIdx.y == 0) {
+    oarr = fwkerhalf1;
+    nf   = nf1;
+  } else if (threadIdx.y == 1) {
+    oarr = fwkerhalf2;
+    nf   = nf2;
+  } else {
+    oarr = fwkerhalf3;
+    nf   = nf3;
+  }
+
+  for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < nf / 2 + 1;
+       i += blockDim.x * gridDim.x) {
+    int brk = 0.5 + i;
+    T x     = 0.0;
+    for (int n = 0; n < q; n++) {
+      x += ft[n] * 2 * (pow(cabs(at[n]), brk) * cos(brk * carg(at[n])));
     }
+    oarr[i] = x;
+  }
 }
 
-template <typename T>
-int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f, cuDoubleComplex *d_a, T *d_fwkerhalf1,
-                           T *d_fwkerhalf2, T *d_fwkerhalf3, int ns, cudaStream_t stream)
+template<typename T>
+int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f,
+                           cuDoubleComplex *d_a, T *d_fwkerhalf1, T *d_fwkerhalf2,
+                           T *d_fwkerhalf3, int ns, cudaStream_t stream)
 /*
     wrapper for approximation of Fourier series of real symmetric spreading
     kernel.
@@ -65,44 +68,43 @@ int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f, cuDoubleC
     Melody Shih 2/20/22
 */
 {
-    int nout = max(max(nf1 / 2 + 1, nf2 / 2 + 1), nf3 / 2 + 1);
+  int nout = max(max(nf1 / 2 + 1, nf2 / 2 + 1), nf3 / 2 + 1);
 
-    dim3 threadsPerBlock(16, dim);
-    dim3 numBlocks((nout + 16 - 1) / 16, 1);
+  dim3 threadsPerBlock(16, dim);
+  dim3 numBlocks((nout + 16 - 1) / 16, 1);
 
-    fseries_kernel_compute<<<numBlocks, threadsPerBlock, 0, stream>>>(nf1, nf2, nf3, d_f, d_a, d_fwkerhalf1,
-                                                                      d_fwkerhalf2, d_fwkerhalf3, ns);
-    RETURN_IF_CUDA_ERROR
+  fseries_kernel_compute<<<numBlocks, threadsPerBlock, 0, stream>>>(
+      nf1, nf2, nf3, d_f, d_a, d_fwkerhalf1, d_fwkerhalf2, d_fwkerhalf3, ns);
+  RETURN_IF_CUDA_ERROR
 
-    return 0;
+  return 0;
 }
 
-template <typename T>
+template<typename T>
 int setup_spreader_for_nufft(finufft_spread_opts &spopts, T eps, cufinufft_opts opts)
 // Set up the spreader parameters given eps, and pass across various nufft
 // options. Report status of setup_spreader.  Barnett 10/30/17
 {
-    int ier = setup_spreader(spopts, eps, (T)opts.upsampfac, opts.gpu_kerevalmeth);
-    return ier;
+  int ier = setup_spreader(spopts, eps, (T)opts.upsampfac, opts.gpu_kerevalmeth);
+  return ier;
 }
 
-void set_nf_type12(CUFINUFFT_BIGINT ms, cufinufft_opts opts, finufft_spread_opts spopts, CUFINUFFT_BIGINT *nf,
-                   CUFINUFFT_BIGINT bs)
+void set_nf_type12(CUFINUFFT_BIGINT ms, cufinufft_opts opts, finufft_spread_opts spopts,
+                   CUFINUFFT_BIGINT *nf, CUFINUFFT_BIGINT bs)
 // type 1 & 2 recipe for how to set 1d size of upsampled array, nf, given opts
 // and requested number of Fourier modes ms.
 {
-    *nf = (CUFINUFFT_BIGINT)(opts.upsampfac * ms);
-    if (*nf < 2 * spopts.nspread)
-        *nf = 2 * spopts.nspread; // otherwise spread fails
-    if (*nf < MAX_NF) {           // otherwise will fail anyway
-        if (opts.gpu_method == 4) // expensive at huge nf
-            *nf = utils::next235beven(*nf, bs);
-        else
-            *nf = utils::next235beven(*nf, 1);
-    }
+  *nf = (CUFINUFFT_BIGINT)(opts.upsampfac * ms);
+  if (*nf < 2 * spopts.nspread) *nf = 2 * spopts.nspread; // otherwise spread fails
+  if (*nf < MAX_NF) {                                     // otherwise will fail anyway
+    if (opts.gpu_method == 4)                             // expensive at huge nf
+      *nf = utils::next235beven(*nf, bs);
+    else
+      *nf = utils::next235beven(*nf, 1);
+  }
 }
 
-template <typename T>
+template<typename T>
 void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, T *fwkerhalf, finufft_spread_opts opts)
 /*
   Approximates exact Fourier series coeffs of cnufftspread's real symmetric
@@ -129,10 +131,10 @@ void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, T *fwkerhalf, finufft_spread_opt
   Melody 2/20/22 separate into precomp & comp functions defined below.
  */
 {
-    T f[MAX_NQUAD];
-    std::complex<double> a[MAX_NQUAD];
-    onedim_fseries_kernel_precomp(nf, f, a, opts);
-    onedim_fseries_kernel_compute(nf, f, a, fwkerhalf, opts);
+  T f[MAX_NQUAD];
+  std::complex<double> a[MAX_NQUAD];
+  onedim_fseries_kernel_precomp(nf, f, a, opts);
+  onedim_fseries_kernel_compute(nf, f, a, fwkerhalf, opts);
 }
 
 /*
@@ -148,70 +150,81 @@ void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, T *fwkerhalf, finufft_spread_opt
   f - funciton values at quadrature nodes multiplied with quadrature weights
   (a, f are provided as the inputs of onedim_fseries_kernel_compute() defined below)
 */
-template <typename T>
-void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, std::complex<double> *a, finufft_spread_opts opts) {
-    T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support
-    // # quadr nodes in z (from 0 to J/2; reflections will be added)...
-    int q = (int)(2 + 3.0 * J2); // not sure why so large? cannot exceed MAX_NQUAD
-    double z[2 * MAX_NQUAD];
-    double w[2 * MAX_NQUAD];
-
-    finufft::quadrature::legendre_compute_glr(2 * q, z, w); // only half the nodes used, eg on (0,1)
-    for (int n = 0; n < q; ++n) {                           // set up nodes z_n and vals f_n
-        z[n] *= J2;                                         // rescale nodes
-        f[n] = J2 * w[n] * evaluate_kernel((T)z[n], opts);  // vals & quadr wei
-        a[n] = exp((T)(2.0 * M_PI) * std::complex<T>(0.0, 1.0) * (T)(nf / 2 - z[n]) / (T)nf); // phase winding rates
-    }
+template<typename T>
+void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, std::complex<double> *a,
+                                   finufft_spread_opts opts) {
+  T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support
+  // # quadr nodes in z (from 0 to J/2; reflections will be added)...
+  int q = (int)(2 + 3.0 * J2); // not sure why so large? cannot exceed MAX_NQUAD
+  double z[2 * MAX_NQUAD];
+  double w[2 * MAX_NQUAD];
+
+  finufft::quadrature::legendre_compute_glr(2 * q, z, w); // only half the nodes used, eg
+                                                          // on (0,1)
+  for (int n = 0; n < q; ++n) {                           // set up nodes z_n and vals f_n
+    z[n] *= J2;                                           // rescale nodes
+    f[n] = J2 * w[n] * evaluate_kernel((T)z[n], opts);    // vals & quadr wei
+    a[n] = exp((T)(2.0 * M_PI) * std::complex<T>(0.0, 1.0) * (T)(nf / 2 - z[n]) /
+               (T)nf);                                    // phase winding rates
+  }
 }
 
-template <typename T>
-void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex<double> *a, T *fwkerhalf,
-                                   finufft_spread_opts opts) {
-    T J2 = opts.nspread / 2.0;                         // J/2, half-width of ker z-support
-    int q = (int)(2 + 3.0 * J2);                       // not sure why so large? cannot exceed MAX_NQUAD
-    CUFINUFFT_BIGINT nout = nf / 2 + 1;                // how many values we're writing to
-    int nt = std::min(nout, MY_OMP_GET_MAX_THREADS()); // how many chunks
-    std::vector<CUFINUFFT_BIGINT> brk(nt + 1);         // start indices for each thread
-    for (int t = 0; t <= nt; ++t)                      // split nout mode indices btw threads
-        brk[t] = (CUFINUFFT_BIGINT)(0.5 + nout * t / (double)nt);
+template<typename T>
+void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex<double> *a,
+                                   T *fwkerhalf, finufft_spread_opts opts) {
+  T J2  = opts.nspread / 2.0;         // J/2, half-width of ker z-support
+  int q = (int)(2 + 3.0 * J2);        // not sure why so large? cannot exceed MAX_NQUAD
+  CUFINUFFT_BIGINT nout = nf / 2 + 1; // how many values we're writing to
+  int nt                = std::min(nout, MY_OMP_GET_MAX_THREADS()); // how many chunks
+  std::vector<CUFINUFFT_BIGINT> brk(nt + 1); // start indices for each thread
+  for (int t = 0; t <= nt; ++t)              // split nout mode indices btw threads
+    brk[t] = (CUFINUFFT_BIGINT)(0.5 + nout * t / (double)nt);
 #pragma omp parallel
-    {
-        int t = MY_OMP_GET_THREAD_NUM();
-        if (t < nt) {                           // could be nt < actual # threads
-            std::complex<double> aj[MAX_NQUAD]; // phase rotator for this thread
-            for (int n = 0; n < q; ++n)
-                aj[n] = pow(a[n], (T)brk[t]);                        // init phase factors for chunk
-            for (CUFINUFFT_BIGINT j = brk[t]; j < brk[t + 1]; ++j) { // loop along output array
-                T x = 0.0;                                           // accumulator for answer at this j
-                for (int n = 0; n < q; ++n) {
-                    x += f[n] * 2 * real(aj[n]); // include the negative freq
-                    aj[n] *= a[n];               // wind the phases
-                }
-                fwkerhalf[j] = x;
-            }
+  {
+    int t = MY_OMP_GET_THREAD_NUM();
+    if (t < nt) {                         // could be nt < actual # threads
+      std::complex<double> aj[MAX_NQUAD]; // phase rotator for this thread
+      for (int n = 0; n < q; ++n)
+        aj[n] = pow(a[n], (T)brk[t]);     // init phase factors for chunk
+      for (CUFINUFFT_BIGINT j = brk[t]; j < brk[t + 1]; ++j) { // loop along output array
+        T x = 0.0;                     // accumulator for answer at this j
+        for (int n = 0; n < q; ++n) {
+          x += f[n] * 2 * real(aj[n]); // include the negative freq
+          aj[n] *= a[n];               // wind the phases
         }
+        fwkerhalf[j] = x;
+      }
     }
+  }
 }
 
-template void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, float *f, std::complex<double> *a, float *fwkerhalf,
+template void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, float *f,
+                                            std::complex<double> *a, float *fwkerhalf,
                                             finufft_spread_opts opts);
-template void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, double *f, std::complex<double> *a, double *fwkerhalf,
+template void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, double *f,
+                                            std::complex<double> *a, double *fwkerhalf,
                                             finufft_spread_opts opts);
 
-template int setup_spreader_for_nufft(finufft_spread_opts &spopts, float eps, cufinufft_opts opts);
-template int setup_spreader_for_nufft(finufft_spread_opts &spopts, double eps, cufinufft_opts opts);
-template void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, float *f, std::complex<double> *a,
-                                            finufft_spread_opts opts);
-template void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, double *f, std::complex<double> *a,
-                                            finufft_spread_opts opts);
-template int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, float *d_f, cuDoubleComplex *d_a,
-                                    float *d_fwkerhalf1, float *d_fwkerhalf2, float *d_fwkerhalf3, int ns,
+template int setup_spreader_for_nufft(finufft_spread_opts &spopts, float eps,
+                                      cufinufft_opts opts);
+template int setup_spreader_for_nufft(finufft_spread_opts &spopts, double eps,
+                                      cufinufft_opts opts);
+template void onedim_fseries_kernel_precomp(
+    CUFINUFFT_BIGINT nf, float *f, std::complex<double> *a, finufft_spread_opts opts);
+template void onedim_fseries_kernel_precomp(
+    CUFINUFFT_BIGINT nf, double *f, std::complex<double> *a, finufft_spread_opts opts);
+template int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, float *d_f,
+                                    cuDoubleComplex *d_a, float *d_fwkerhalf1,
+                                    float *d_fwkerhalf2, float *d_fwkerhalf3, int ns,
                                     cudaStream_t stream);
-template int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, double *d_f, cuDoubleComplex *d_a,
-                                    double *d_fwkerhalf1, double *d_fwkerhalf2, double *d_fwkerhalf3, int ns,
+template int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, double *d_f,
+                                    cuDoubleComplex *d_a, double *d_fwkerhalf1,
+                                    double *d_fwkerhalf2, double *d_fwkerhalf3, int ns,
                                     cudaStream_t stream);
 
-template void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, float *fwkerhalf, finufft_spread_opts opts);
-template void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, double *fwkerhalf, finufft_spread_opts opts);
+template void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, float *fwkerhalf,
+                                    finufft_spread_opts opts);
+template void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, double *fwkerhalf,
+                                    finufft_spread_opts opts);
 } // namespace common
 } // namespace cufinufft
diff --git a/src/cuda/cufinufft.cu b/src/cuda/cufinufft.cu
index 60cdd4482..a81e88780 100644
--- a/src/cuda/cufinufft.cu
+++ b/src/cuda/cufinufft.cu
@@ -7,76 +7,75 @@
 #include <cufinufft/impl.h>
 
 inline bool is_invalid_mode_array(int dim, const int64_t *modes64, int32_t modes32[3]) {
-    int64_t tot_size = 1;
-    for (int i = 0; i < dim; ++i) {
-        if (modes64[i] > std::numeric_limits<int32_t>::max())
-            return true;
-        if (modes64[i] <= 0)
-            return true;
-        modes32[i] = modes64[i];
-        tot_size *= modes64[i];
-    }
-    for (int i = dim; i < 3; ++i)
-        modes32[i] = 1;
-
-    return tot_size > std::numeric_limits<int32_t>::max();
+  int64_t tot_size = 1;
+  for (int i = 0; i < dim; ++i) {
+    if (modes64[i] > std::numeric_limits<int32_t>::max()) return true;
+    if (modes64[i] <= 0) return true;
+    modes32[i] = modes64[i];
+    tot_size *= modes64[i];
+  }
+  for (int i = dim; i < 3; ++i) modes32[i] = 1;
+
+  return tot_size > std::numeric_limits<int32_t>::max();
 }
 
 extern "C" {
-int cufinufftf_makeplan(int type, int dim, const int64_t *nmodes, int iflag, int ntransf, float tol,
-                        cufinufftf_plan *d_plan_ptr, cufinufft_opts *opts) {
-    if (dim < 1 || dim > 3) {
-        fprintf(stderr, "[%s] Invalid dim (%d), should be 1, 2 or 3.\n", __func__, dim);
-        return FINUFFT_ERR_DIM_NOTVALID;
-    }
-
-    int nmodes32[3];
-    if (is_invalid_mode_array(dim, nmodes, nmodes32))
-        return FINUFFT_ERR_NDATA_NOTVALID;
-
-    return cufinufft_makeplan_impl(type, dim, nmodes32, iflag, ntransf, tol, (cufinufft_plan_t<float> **)d_plan_ptr,
-                                   opts);
+int cufinufftf_makeplan(int type, int dim, const int64_t *nmodes, int iflag, int ntransf,
+                        float tol, cufinufftf_plan *d_plan_ptr, cufinufft_opts *opts) {
+  if (dim < 1 || dim > 3) {
+    fprintf(stderr, "[%s] Invalid dim (%d), should be 1, 2 or 3.\n", __func__, dim);
+    return FINUFFT_ERR_DIM_NOTVALID;
+  }
+
+  int nmodes32[3];
+  if (is_invalid_mode_array(dim, nmodes, nmodes32)) return FINUFFT_ERR_NDATA_NOTVALID;
+
+  return cufinufft_makeplan_impl(type, dim, nmodes32, iflag, ntransf, tol,
+                                 (cufinufft_plan_t<float> **)d_plan_ptr, opts);
 }
 
-int cufinufft_makeplan(int type, int dim, const int64_t *nmodes, int iflag, int ntransf, double tol,
-                       cufinufft_plan *d_plan_ptr, cufinufft_opts *opts) {
-    if (dim < 1 || dim > 3) {
-        fprintf(stderr, "[%s] Invalid dim (%d), should be 1, 2 or 3.\n", __func__, dim);
-        return FINUFFT_ERR_DIM_NOTVALID;
-    }
+int cufinufft_makeplan(int type, int dim, const int64_t *nmodes, int iflag, int ntransf,
+                       double tol, cufinufft_plan *d_plan_ptr, cufinufft_opts *opts) {
+  if (dim < 1 || dim > 3) {
+    fprintf(stderr, "[%s] Invalid dim (%d), should be 1, 2 or 3.\n", __func__, dim);
+    return FINUFFT_ERR_DIM_NOTVALID;
+  }
 
-    int nmodes32[3];
-    if (is_invalid_mode_array(dim, nmodes, nmodes32))
-        return FINUFFT_ERR_NDATA_NOTVALID;
+  int nmodes32[3];
+  if (is_invalid_mode_array(dim, nmodes, nmodes32)) return FINUFFT_ERR_NDATA_NOTVALID;
 
-    return cufinufft_makeplan_impl(type, dim, nmodes32, iflag, ntransf, tol, (cufinufft_plan_t<double> **)d_plan_ptr,
-                                   opts);
+  return cufinufft_makeplan_impl(type, dim, nmodes32, iflag, ntransf, tol,
+                                 (cufinufft_plan_t<double> **)d_plan_ptr, opts);
 }
 
-int cufinufftf_setpts(cufinufftf_plan d_plan, int M, float *d_x, float *d_y, float *d_z, int N, float *d_s,
-                      float *d_t, float *d_u) {
-    return cufinufft_setpts_impl(M, d_x, d_y, d_z, N, d_s, d_t, d_u, (cufinufft_plan_t<float> *)d_plan);
+int cufinufftf_setpts(cufinufftf_plan d_plan, int M, float *d_x, float *d_y, float *d_z,
+                      int N, float *d_s, float *d_t, float *d_u) {
+  return cufinufft_setpts_impl(M, d_x, d_y, d_z, N, d_s, d_t, d_u,
+                               (cufinufft_plan_t<float> *)d_plan);
 }
 
-int cufinufft_setpts(cufinufft_plan d_plan, int M, double *d_x, double *d_y, double *d_z, int N, double *d_s,
-                     double *d_t, double *d_u) {
-    return cufinufft_setpts_impl(M, d_x, d_y, d_z, N, d_s, d_t, d_u, (cufinufft_plan_t<double> *)d_plan);
+int cufinufft_setpts(cufinufft_plan d_plan, int M, double *d_x, double *d_y, double *d_z,
+                     int N, double *d_s, double *d_t, double *d_u) {
+  return cufinufft_setpts_impl(M, d_x, d_y, d_z, N, d_s, d_t, d_u,
+                               (cufinufft_plan_t<double> *)d_plan);
 }
 
-int cufinufftf_execute(cufinufftf_plan d_plan, cuFloatComplex *d_c, cuFloatComplex *d_fk) {
-    return cufinufft_execute_impl<float>(d_c, d_fk, (cufinufft_plan_t<float> *)d_plan);
+int cufinufftf_execute(cufinufftf_plan d_plan, cuFloatComplex *d_c,
+                       cuFloatComplex *d_fk) {
+  return cufinufft_execute_impl<float>(d_c, d_fk, (cufinufft_plan_t<float> *)d_plan);
 }
 
-int cufinufft_execute(cufinufft_plan d_plan, cuDoubleComplex *d_c, cuda_complex<double> *d_fk) {
-    return cufinufft_execute_impl<double>(d_c, d_fk, (cufinufft_plan_t<double> *)d_plan);
+int cufinufft_execute(cufinufft_plan d_plan, cuDoubleComplex *d_c,
+                      cuda_complex<double> *d_fk) {
+  return cufinufft_execute_impl<double>(d_c, d_fk, (cufinufft_plan_t<double> *)d_plan);
 }
 
 int cufinufftf_destroy(cufinufftf_plan d_plan) {
-    return cufinufft_destroy_impl<float>((cufinufft_plan_t<float> *)d_plan);
+  return cufinufft_destroy_impl<float>((cufinufft_plan_t<float> *)d_plan);
 }
 
 int cufinufft_destroy(cufinufft_plan d_plan) {
-    return cufinufft_destroy_impl<double>((cufinufft_plan_t<double> *)d_plan);
+  return cufinufft_destroy_impl<double>((cufinufft_plan_t<double> *)d_plan);
 }
 
 void cufinufft_default_opts(cufinufft_opts *opts)
@@ -96,30 +95,30 @@ void cufinufft_default_opts(cufinufft_opts *opts)
     Melody Shih 07/25/19; Barnett 2/5/21.
 */
 {
-    opts->upsampfac = 2.0;
+  opts->upsampfac = 2.0;
 
-    /* following options are for gpu */
-    opts->gpu_sort = 1; // access nupts in an ordered way for nupts driven method
+  /* following options are for gpu */
+  opts->gpu_sort = 1; // access nupts in an ordered way for nupts driven method
 
-    opts->gpu_maxsubprobsize = 1024;
-    opts->gpu_obinsizex = -1;
-    opts->gpu_obinsizey = -1;
-    opts->gpu_obinsizez = -1;
+  opts->gpu_maxsubprobsize = 1024;
+  opts->gpu_obinsizex      = -1;
+  opts->gpu_obinsizey      = -1;
+  opts->gpu_obinsizez      = -1;
 
-    opts->gpu_binsizex = -1;
-    opts->gpu_binsizey = -1;
-    opts->gpu_binsizez = -1;
+  opts->gpu_binsizex = -1;
+  opts->gpu_binsizey = -1;
+  opts->gpu_binsizez = -1;
 
-    opts->gpu_spreadinterponly = 0; // default to do the whole nufft
+  opts->gpu_spreadinterponly = 0; // default to do the whole nufft
 
-    opts->gpu_maxbatchsize = 0; // Heuristically set
-    opts->gpu_stream = cudaStreamDefault;
+  opts->gpu_maxbatchsize = 0;     // Heuristically set
+  opts->gpu_stream       = cudaStreamDefault;
 
-    opts->gpu_kerevalmeth = 1; // Horner
+  opts->gpu_kerevalmeth = 1; // Horner
 
-    opts->gpu_method = 0; // Auto method (2 for type 1, 2 for type 2).
+  opts->gpu_method = 0;      // Auto method (2 for type 1, 2 for type 2).
 
-    // By default, only use device 0
-    opts->gpu_device_id = 0;
+  // By default, only use device 0
+  opts->gpu_device_id = 0;
 }
 }
diff --git a/src/cuda/deconvolve_wrapper.cu b/src/cuda/deconvolve_wrapper.cu
index ffb65b8da..efdd656c7 100644
--- a/src/cuda/deconvolve_wrapper.cu
+++ b/src/cuda/deconvolve_wrapper.cu
@@ -12,102 +12,114 @@ namespace deconvolve {
 /* Kernel for copying fw to fk with amplication by prefac/ker */
 // Note: assume modeord=0: CMCL-compatible mode ordering in fk (from -N/2 up
 // to N/2-1)
-template <typename T>
-__global__ void deconvolve_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1) {
-    for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms; i += blockDim.x * gridDim.x) {
-        int w1 = i - ms / 2 >= 0 ? i - ms / 2 : nf1 + i - ms / 2;
-
-        T kervalue = fwkerhalf1[abs(i - ms / 2)];
-        fk[i].x = fw[w1].x / kervalue;
-        fk[i].y = fw[w1].y / kervalue;
-    }
+template<typename T>
+__global__ void deconvolve_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> *fk,
+                              T *fwkerhalf1) {
+  for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms;
+       i += blockDim.x * gridDim.x) {
+    int w1 = i - ms / 2 >= 0 ? i - ms / 2 : nf1 + i - ms / 2;
+
+    T kervalue = fwkerhalf1[abs(i - ms / 2)];
+    fk[i].x    = fw[w1].x / kervalue;
+    fk[i].y    = fw[w1].y / kervalue;
+  }
 }
 
-template <typename T>
-__global__ void deconvolve_2d(int ms, int mt, int nf1, int nf2, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1,
-                              T *fwkerhalf2) {
-    for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt; i += blockDim.x * gridDim.x) {
-        int k1 = i % ms;
-        int k2 = i / ms;
-        int outidx = k1 + k2 * ms;
-        int w1 = k1 - ms / 2 >= 0 ? k1 - ms / 2 : nf1 + k1 - ms / 2;
-        int w2 = k2 - mt / 2 >= 0 ? k2 - mt / 2 : nf2 + k2 - mt / 2;
-        int inidx = w1 + w2 * nf1;
-
-        T kervalue = fwkerhalf1[abs(k1 - ms / 2)] * fwkerhalf2[abs(k2 - mt / 2)];
-        fk[outidx].x = fw[inidx].x / kervalue;
-        fk[outidx].y = fw[inidx].y / kervalue;
-    }
+template<typename T>
+__global__ void deconvolve_2d(int ms, int mt, int nf1, int nf2, cuda_complex<T> *fw,
+                              cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2) {
+  for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt;
+       i += blockDim.x * gridDim.x) {
+    int k1     = i % ms;
+    int k2     = i / ms;
+    int outidx = k1 + k2 * ms;
+    int w1     = k1 - ms / 2 >= 0 ? k1 - ms / 2 : nf1 + k1 - ms / 2;
+    int w2     = k2 - mt / 2 >= 0 ? k2 - mt / 2 : nf2 + k2 - mt / 2;
+    int inidx  = w1 + w2 * nf1;
+
+    T kervalue   = fwkerhalf1[abs(k1 - ms / 2)] * fwkerhalf2[abs(k2 - mt / 2)];
+    fk[outidx].x = fw[inidx].x / kervalue;
+    fk[outidx].y = fw[inidx].y / kervalue;
+  }
 }
 
-template <typename T>
-__global__ void deconvolve_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, cuda_complex<T> *fw,
-                              cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3) {
-    for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt * mu; i += blockDim.x * gridDim.x) {
-        int k1 = i % ms;
-        int k2 = (i / ms) % mt;
-        int k3 = (i / ms / mt);
-        int outidx = k1 + k2 * ms + k3 * ms * mt;
-        int w1 = k1 - ms / 2 >= 0 ? k1 - ms / 2 : nf1 + k1 - ms / 2;
-        int w2 = k2 - mt / 2 >= 0 ? k2 - mt / 2 : nf2 + k2 - mt / 2;
-        int w3 = k3 - mu / 2 >= 0 ? k3 - mu / 2 : nf3 + k3 - mu / 2;
-        int inidx = w1 + w2 * nf1 + w3 * nf1 * nf2;
-
-        T kervalue = fwkerhalf1[abs(k1 - ms / 2)] * fwkerhalf2[abs(k2 - mt / 2)] * fwkerhalf3[abs(k3 - mu / 2)];
-        fk[outidx].x = fw[inidx].x / kervalue;
-        fk[outidx].y = fw[inidx].y / kervalue;
-    }
+template<typename T>
+__global__ void deconvolve_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3,
+                              cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1,
+                              T *fwkerhalf2, T *fwkerhalf3) {
+  for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt * mu;
+       i += blockDim.x * gridDim.x) {
+    int k1     = i % ms;
+    int k2     = (i / ms) % mt;
+    int k3     = (i / ms / mt);
+    int outidx = k1 + k2 * ms + k3 * ms * mt;
+    int w1     = k1 - ms / 2 >= 0 ? k1 - ms / 2 : nf1 + k1 - ms / 2;
+    int w2     = k2 - mt / 2 >= 0 ? k2 - mt / 2 : nf2 + k2 - mt / 2;
+    int w3     = k3 - mu / 2 >= 0 ? k3 - mu / 2 : nf3 + k3 - mu / 2;
+    int inidx  = w1 + w2 * nf1 + w3 * nf1 * nf2;
+
+    T kervalue = fwkerhalf1[abs(k1 - ms / 2)] * fwkerhalf2[abs(k2 - mt / 2)] *
+                 fwkerhalf3[abs(k3 - mu / 2)];
+    fk[outidx].x = fw[inidx].x / kervalue;
+    fk[outidx].y = fw[inidx].y / kervalue;
+  }
 }
 
 /* Kernel for copying fk to fw with same amplication */
-template <typename T>
-__global__ void amplify_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1) {
-    for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms; i += blockDim.x * gridDim.x) {
-        int w1 = i - ms / 2 >= 0 ? i - ms / 2 : nf1 + i - ms / 2;
-
-        T kervalue = fwkerhalf1[abs(i - ms / 2)];
-        fw[w1].x = fk[i].x / kervalue;
-        fw[w1].y = fk[i].y / kervalue;
-    }
+template<typename T>
+__global__ void amplify_1d(int ms, int nf1, cuda_complex<T> *fw, cuda_complex<T> *fk,
+                           T *fwkerhalf1) {
+  for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms;
+       i += blockDim.x * gridDim.x) {
+    int w1 = i - ms / 2 >= 0 ? i - ms / 2 : nf1 + i - ms / 2;
+
+    T kervalue = fwkerhalf1[abs(i - ms / 2)];
+    fw[w1].x   = fk[i].x / kervalue;
+    fw[w1].y   = fk[i].y / kervalue;
+  }
 }
 
-template <typename T>
-__global__ void amplify_2d(int ms, int mt, int nf1, int nf2, cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1,
-                           T *fwkerhalf2) {
-    for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt; i += blockDim.x * gridDim.x) {
-        int k1 = i % ms;
-        int k2 = i / ms;
-        int inidx = k1 + k2 * ms;
-        int w1 = k1 - ms / 2 >= 0 ? k1 - ms / 2 : nf1 + k1 - ms / 2;
-        int w2 = k2 - mt / 2 >= 0 ? k2 - mt / 2 : nf2 + k2 - mt / 2;
-        int outidx = w1 + w2 * nf1;
-
-        T kervalue = fwkerhalf1[abs(k1 - ms / 2)] * fwkerhalf2[abs(k2 - mt / 2)];
-        fw[outidx].x = fk[inidx].x / kervalue;
-        fw[outidx].y = fk[inidx].y / kervalue;
-    }
+template<typename T>
+__global__ void amplify_2d(int ms, int mt, int nf1, int nf2, cuda_complex<T> *fw,
+                           cuda_complex<T> *fk, T *fwkerhalf1, T *fwkerhalf2) {
+  for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt;
+       i += blockDim.x * gridDim.x) {
+    int k1     = i % ms;
+    int k2     = i / ms;
+    int inidx  = k1 + k2 * ms;
+    int w1     = k1 - ms / 2 >= 0 ? k1 - ms / 2 : nf1 + k1 - ms / 2;
+    int w2     = k2 - mt / 2 >= 0 ? k2 - mt / 2 : nf2 + k2 - mt / 2;
+    int outidx = w1 + w2 * nf1;
+
+    T kervalue   = fwkerhalf1[abs(k1 - ms / 2)] * fwkerhalf2[abs(k2 - mt / 2)];
+    fw[outidx].x = fk[inidx].x / kervalue;
+    fw[outidx].y = fk[inidx].y / kervalue;
+  }
 }
 
-template <typename T>
-__global__ void amplify_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, cuda_complex<T> *fw, cuda_complex<T> *fk,
-                           T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3) {
-    for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt * mu; i += blockDim.x * gridDim.x) {
-        int k1 = i % ms;
-        int k2 = (i / ms) % mt;
-        int k3 = (i / ms / mt);
-        int inidx = k1 + k2 * ms + k3 * ms * mt;
-        int w1 = k1 - ms / 2 >= 0 ? k1 - ms / 2 : nf1 + k1 - ms / 2;
-        int w2 = k2 - mt / 2 >= 0 ? k2 - mt / 2 : nf2 + k2 - mt / 2;
-        int w3 = k3 - mu / 2 >= 0 ? k3 - mu / 2 : nf3 + k3 - mu / 2;
-        int outidx = w1 + w2 * nf1 + w3 * nf1 * nf2;
-
-        T kervalue = fwkerhalf1[abs(k1 - ms / 2)] * fwkerhalf2[abs(k2 - mt / 2)] * fwkerhalf3[abs(k3 - mu / 2)];
-        fw[outidx].x = fk[inidx].x / kervalue;
-        fw[outidx].y = fk[inidx].y / kervalue;
-    }
+template<typename T>
+__global__ void amplify_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3,
+                           cuda_complex<T> *fw, cuda_complex<T> *fk, T *fwkerhalf1,
+                           T *fwkerhalf2, T *fwkerhalf3) {
+  for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt * mu;
+       i += blockDim.x * gridDim.x) {
+    int k1     = i % ms;
+    int k2     = (i / ms) % mt;
+    int k3     = (i / ms / mt);
+    int inidx  = k1 + k2 * ms + k3 * ms * mt;
+    int w1     = k1 - ms / 2 >= 0 ? k1 - ms / 2 : nf1 + k1 - ms / 2;
+    int w2     = k2 - mt / 2 >= 0 ? k2 - mt / 2 : nf2 + k2 - mt / 2;
+    int w3     = k3 - mu / 2 >= 0 ? k3 - mu / 2 : nf3 + k3 - mu / 2;
+    int outidx = w1 + w2 * nf1 + w3 * nf1 * nf2;
+
+    T kervalue = fwkerhalf1[abs(k1 - ms / 2)] * fwkerhalf2[abs(k2 - mt / 2)] *
+                 fwkerhalf3[abs(k3 - mu / 2)];
+    fw[outidx].x = fk[inidx].x / kervalue;
+    fw[outidx].y = fk[inidx].y / kervalue;
+  }
 }
 
-template <typename T>
+template<typename T>
 int cudeconvolve1d(cufinufft_plan_t<T> *d_plan, int blksize)
 /*
     wrapper for deconvolution & amplication in 1D.
@@ -115,29 +127,30 @@ int cudeconvolve1d(cufinufft_plan_t<T> *d_plan, int blksize)
     Melody Shih 11/21/21
 */
 {
-    auto &stream = d_plan->stream;
-
-    int ms = d_plan->ms;
-    int nf1 = d_plan->nf1;
-    int nmodes = ms;
-    int maxbatchsize = d_plan->maxbatchsize;
-
-    if (d_plan->spopts.spread_direction == 1) {
-        for (int t = 0; t < blksize; t++) {
-            deconvolve_1d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, nf1, d_plan->fw + t * nf1,
-                                                                        d_plan->fk + t * nmodes, d_plan->fwkerhalf1);
-        }
-    } else {
-        checkCudaErrors(cudaMemsetAsync(d_plan->fw, 0, maxbatchsize * nf1 * sizeof(cuda_complex<T>), stream));
-        for (int t = 0; t < blksize; t++) {
-            amplify_1d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, nf1, d_plan->fw + t * nf1,
-                                                                     d_plan->fk + t * nmodes, d_plan->fwkerhalf1);
-        }
+  auto &stream = d_plan->stream;
+
+  int ms           = d_plan->ms;
+  int nf1          = d_plan->nf1;
+  int nmodes       = ms;
+  int maxbatchsize = d_plan->maxbatchsize;
+
+  if (d_plan->spopts.spread_direction == 1) {
+    for (int t = 0; t < blksize; t++) {
+      deconvolve_1d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(
+          ms, nf1, d_plan->fw + t * nf1, d_plan->fk + t * nmodes, d_plan->fwkerhalf1);
+    }
+  } else {
+    checkCudaErrors(cudaMemsetAsync(
+        d_plan->fw, 0, maxbatchsize * nf1 * sizeof(cuda_complex<T>), stream));
+    for (int t = 0; t < blksize; t++) {
+      amplify_1d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(
+          ms, nf1, d_plan->fw + t * nf1, d_plan->fk + t * nmodes, d_plan->fwkerhalf1);
     }
-    return 0;
+  }
+  return 0;
 }
 
-template <typename T>
+template<typename T>
 int cudeconvolve2d(cufinufft_plan_t<T> *d_plan, int blksize)
 /*
     wrapper for deconvolution & amplication in 2D.
@@ -145,33 +158,34 @@ int cudeconvolve2d(cufinufft_plan_t<T> *d_plan, int blksize)
     Melody Shih 07/25/19
 */
 {
-    auto &stream = d_plan->stream;
-
-    int ms = d_plan->ms;
-    int mt = d_plan->mt;
-    int nf1 = d_plan->nf1;
-    int nf2 = d_plan->nf2;
-    int nmodes = ms * mt;
-    int maxbatchsize = d_plan->maxbatchsize;
-
-    if (d_plan->spopts.spread_direction == 1) {
-        for (int t = 0; t < blksize; t++) {
-            deconvolve_2d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, mt, nf1, nf2, d_plan->fw + t * nf1 * nf2,
-                                                                        d_plan->fk + t * nmodes, d_plan->fwkerhalf1,
-                                                                        d_plan->fwkerhalf2);
-        }
-    } else {
-        checkCudaErrors(cudaMemsetAsync(d_plan->fw, 0, maxbatchsize * nf1 * nf2 * sizeof(cuda_complex<T>), stream));
-        for (int t = 0; t < blksize; t++) {
-            amplify_2d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, mt, nf1, nf2, d_plan->fw + t * nf1 * nf2,
-                                                                     d_plan->fk + t * nmodes, d_plan->fwkerhalf1,
-                                                                     d_plan->fwkerhalf2);
-        }
+  auto &stream = d_plan->stream;
+
+  int ms           = d_plan->ms;
+  int mt           = d_plan->mt;
+  int nf1          = d_plan->nf1;
+  int nf2          = d_plan->nf2;
+  int nmodes       = ms * mt;
+  int maxbatchsize = d_plan->maxbatchsize;
+
+  if (d_plan->spopts.spread_direction == 1) {
+    for (int t = 0; t < blksize; t++) {
+      deconvolve_2d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(
+          ms, mt, nf1, nf2, d_plan->fw + t * nf1 * nf2, d_plan->fk + t * nmodes,
+          d_plan->fwkerhalf1, d_plan->fwkerhalf2);
     }
-    return 0;
+  } else {
+    checkCudaErrors(cudaMemsetAsync(
+        d_plan->fw, 0, maxbatchsize * nf1 * nf2 * sizeof(cuda_complex<T>), stream));
+    for (int t = 0; t < blksize; t++) {
+      amplify_2d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(
+          ms, mt, nf1, nf2, d_plan->fw + t * nf1 * nf2, d_plan->fk + t * nmodes,
+          d_plan->fwkerhalf1, d_plan->fwkerhalf2);
+    }
+  }
+  return 0;
 }
 
-template <typename T>
+template<typename T>
 int cudeconvolve3d(cufinufft_plan_t<T> *d_plan, int blksize)
 /*
     wrapper for deconvolution & amplication in 3D.
@@ -179,32 +193,34 @@ int cudeconvolve3d(cufinufft_plan_t<T> *d_plan, int blksize)
     Melody Shih 07/25/19
 */
 {
-    auto &stream = d_plan->stream;
-
-    int ms = d_plan->ms;
-    int mt = d_plan->mt;
-    int mu = d_plan->mu;
-    int nf1 = d_plan->nf1;
-    int nf2 = d_plan->nf2;
-    int nf3 = d_plan->nf3;
-    int nmodes = ms * mt * mu;
-    int maxbatchsize = d_plan->maxbatchsize;
-    if (d_plan->spopts.spread_direction == 1) {
-        for (int t = 0; t < blksize; t++) {
-            deconvolve_3d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(
-                ms, mt, mu, nf1, nf2, nf3, d_plan->fw + t * nf1 * nf2 * nf3, d_plan->fk + t * nmodes,
-                d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3);
-        }
-    } else {
-        checkCudaErrors(
-            cudaMemsetAsync(d_plan->fw, 0, maxbatchsize * nf1 * nf2 * nf3 * sizeof(cuda_complex<T>), stream));
-        for (int t = 0; t < blksize; t++) {
-            amplify_3d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(
-                ms, mt, mu, nf1, nf2, nf3, d_plan->fw + t * nf1 * nf2 * nf3, d_plan->fk + t * nmodes,
-                d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3);
-        }
+  auto &stream = d_plan->stream;
+
+  int ms           = d_plan->ms;
+  int mt           = d_plan->mt;
+  int mu           = d_plan->mu;
+  int nf1          = d_plan->nf1;
+  int nf2          = d_plan->nf2;
+  int nf3          = d_plan->nf3;
+  int nmodes       = ms * mt * mu;
+  int maxbatchsize = d_plan->maxbatchsize;
+  if (d_plan->spopts.spread_direction == 1) {
+    for (int t = 0; t < blksize; t++) {
+      deconvolve_3d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(
+          ms, mt, mu, nf1, nf2, nf3, d_plan->fw + t * nf1 * nf2 * nf3,
+          d_plan->fk + t * nmodes, d_plan->fwkerhalf1, d_plan->fwkerhalf2,
+          d_plan->fwkerhalf3);
+    }
+  } else {
+    checkCudaErrors(cudaMemsetAsync(
+        d_plan->fw, 0, maxbatchsize * nf1 * nf2 * nf3 * sizeof(cuda_complex<T>), stream));
+    for (int t = 0; t < blksize; t++) {
+      amplify_3d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(
+          ms, mt, mu, nf1, nf2, nf3, d_plan->fw + t * nf1 * nf2 * nf3,
+          d_plan->fk + t * nmodes, d_plan->fwkerhalf1, d_plan->fwkerhalf2,
+          d_plan->fwkerhalf3);
     }
-    return 0;
+  }
+  return 0;
 }
 
 template int cudeconvolve1d<float>(cufinufft_plan_t<float> *d_plan, int blksize);
diff --git a/src/cuda/memtransfer_wrapper.cu b/src/cuda/memtransfer_wrapper.cu
index a00fa526e..ea2170b9b 100644
--- a/src/cuda/memtransfer_wrapper.cu
+++ b/src/cuda/memtransfer_wrapper.cu
@@ -11,7 +11,7 @@
 namespace cufinufft {
 namespace memtransfer {
 
-template <typename T>
+template<typename T>
 int allocgpumem1d_plan(cufinufft_plan_t<T> *d_plan)
 /*
     wrapper for gpu memory allocation in "plan" stage.
@@ -19,53 +19,60 @@ int allocgpumem1d_plan(cufinufft_plan_t<T> *d_plan)
     Melody Shih 11/21/21
 */
 {
-    utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id);
-    auto &stream = d_plan->stream;
-
-    int ier;
-    int nf1 = d_plan->nf1;
-    int maxbatchsize = d_plan->maxbatchsize;
-
-    switch (d_plan->opts.gpu_method) {
-    case 1: {
-        if (d_plan->opts.gpu_sort) {
-            int numbins = ceil((T)nf1 / d_plan->opts.gpu_binsizex);
-            if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->binsize, numbins * sizeof(int), stream))))
-                goto finalize;
-            if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->binstartpts, numbins * sizeof(int), stream))))
-                goto finalize;
-        }
-    } break;
-    case 2: {
-        int numbins = ceil((T)nf1 / d_plan->opts.gpu_binsizex);
-        if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->numsubprob, numbins * sizeof(int), stream))))
-            goto finalize;
-        if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->binsize, numbins * sizeof(int), stream))))
-            goto finalize;
-        if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->binstartpts, numbins * sizeof(int), stream))))
-            goto finalize;
-        if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->subprobstartpts, (numbins + 1) * sizeof(int), stream))))
-            goto finalize;
-    } break;
-    default:
-        std::cerr << "err: invalid method " << std::endl;
-    }
-
-    if (!d_plan->opts.gpu_spreadinterponly) {
-        if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->fw, maxbatchsize * nf1 * sizeof(cuda_complex<T>), stream))))
-            goto finalize;
-        if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->fwkerhalf1, (nf1 / 2 + 1) * sizeof(T), stream))))
-            goto finalize;
+  utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id);
+  auto &stream = d_plan->stream;
+
+  int ier;
+  int nf1          = d_plan->nf1;
+  int maxbatchsize = d_plan->maxbatchsize;
+
+  switch (d_plan->opts.gpu_method) {
+  case 1: {
+    if (d_plan->opts.gpu_sort) {
+      int numbins = ceil((T)nf1 / d_plan->opts.gpu_binsizex);
+      if ((ier = checkCudaErrors(
+               cudaMallocAsync(&d_plan->binsize, numbins * sizeof(int), stream))))
+        goto finalize;
+      if ((ier = checkCudaErrors(
+               cudaMallocAsync(&d_plan->binstartpts, numbins * sizeof(int), stream))))
+        goto finalize;
     }
+  } break;
+  case 2: {
+    int numbins = ceil((T)nf1 / d_plan->opts.gpu_binsizex);
+    if ((ier = checkCudaErrors(
+             cudaMallocAsync(&d_plan->numsubprob, numbins * sizeof(int), stream))))
+      goto finalize;
+    if ((ier = checkCudaErrors(
+             cudaMallocAsync(&d_plan->binsize, numbins * sizeof(int), stream))))
+      goto finalize;
+    if ((ier = checkCudaErrors(
+             cudaMallocAsync(&d_plan->binstartpts, numbins * sizeof(int), stream))))
+      goto finalize;
+    if ((ier = checkCudaErrors(cudaMallocAsync(
+             &d_plan->subprobstartpts, (numbins + 1) * sizeof(int), stream))))
+      goto finalize;
+  } break;
+  default:
+    std::cerr << "err: invalid method " << std::endl;
+  }
+
+  if (!d_plan->opts.gpu_spreadinterponly) {
+    if ((ier = checkCudaErrors(cudaMallocAsync(
+             &d_plan->fw, maxbatchsize * nf1 * sizeof(cuda_complex<T>), stream))))
+      goto finalize;
+    if ((ier = checkCudaErrors(
+             cudaMallocAsync(&d_plan->fwkerhalf1, (nf1 / 2 + 1) * sizeof(T), stream))))
+      goto finalize;
+  }
 
 finalize:
-    if (ier)
-        freegpumemory(d_plan);
+  if (ier) freegpumemory(d_plan);
 
-    return ier;
+  return ier;
 }
 
-template <typename T>
+template<typename T>
 int allocgpumem1d_nupts(cufinufft_plan_t<T> *d_plan)
 /*
     wrapper for gpu memory allocation in "setNUpts" stage.
@@ -73,41 +80,43 @@ int allocgpumem1d_nupts(cufinufft_plan_t<T> *d_plan)
     Melody Shih 11/21/21
 */
 {
-    utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id);
-    auto &stream = d_plan->stream;
-    int ier;
-
-    int M = d_plan->M;
-    CUDA_FREE_AND_NULL(d_plan->sortidx, stream);
-    CUDA_FREE_AND_NULL(d_plan->idxnupts, stream);
-
-    switch (d_plan->opts.gpu_method) {
-    case 1: {
-        if (d_plan->opts.gpu_sort &&
-            (ier = checkCudaErrors(cudaMallocAsync(&d_plan->sortidx, M * sizeof(int), stream))))
-            goto finalize;
-        if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream))))
-            goto finalize;
-    } break;
-    case 2: {
-        if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream))))
-            goto finalize;
-        if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->sortidx, M * sizeof(int), stream))))
-            goto finalize;
-    } break;
-    default:
-        std::cerr << "[allocgpumem1d_nupts] error: invalid method\n";
-        ier = FINUFFT_ERR_METHOD_NOTVALID;
-    }
+  utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id);
+  auto &stream = d_plan->stream;
+  int ier;
+
+  int M = d_plan->M;
+  CUDA_FREE_AND_NULL(d_plan->sortidx, stream);
+  CUDA_FREE_AND_NULL(d_plan->idxnupts, stream);
+
+  switch (d_plan->opts.gpu_method) {
+  case 1: {
+    if (d_plan->opts.gpu_sort && (ier = checkCudaErrors(cudaMallocAsync(
+                                      &d_plan->sortidx, M * sizeof(int), stream))))
+      goto finalize;
+    if ((ier = checkCudaErrors(
+             cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream))))
+      goto finalize;
+  } break;
+  case 2: {
+    if ((ier = checkCudaErrors(
+             cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream))))
+      goto finalize;
+    if ((ier =
+             checkCudaErrors(cudaMallocAsync(&d_plan->sortidx, M * sizeof(int), stream))))
+      goto finalize;
+  } break;
+  default:
+    std::cerr << "[allocgpumem1d_nupts] error: invalid method\n";
+    ier = FINUFFT_ERR_METHOD_NOTVALID;
+  }
 
 finalize:
-    if (ier)
-        freegpumemory(d_plan);
+  if (ier) freegpumemory(d_plan);
 
-    return ier;
+  return ier;
 }
 
-template <typename T>
+template<typename T>
 int allocgpumem2d_plan(cufinufft_plan_t<T> *d_plan)
 /*
     wrapper for gpu memory allocation in "plan" stage.
@@ -115,66 +124,70 @@ int allocgpumem2d_plan(cufinufft_plan_t<T> *d_plan)
     Melody Shih 07/25/19
 */
 {
-    utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id);
-    auto &stream = d_plan->stream;
-    int ier;
-
-    int nf1 = d_plan->nf1;
-    int nf2 = d_plan->nf2;
-    int maxbatchsize = d_plan->maxbatchsize;
-
-    switch (d_plan->opts.gpu_method) {
-    case 1: {
-        if (d_plan->opts.gpu_sort) {
-            int numbins[2];
-            numbins[0] = ceil((T)nf1 / d_plan->opts.gpu_binsizex);
-            numbins[1] = ceil((T)nf2 / d_plan->opts.gpu_binsizey);
-            if ((ier =
-                     checkCudaErrors(cudaMallocAsync(&d_plan->binsize, numbins[0] * numbins[1] * sizeof(int), stream))))
-                goto finalize;
-            if ((ier = checkCudaErrors(
-                     cudaMallocAsync(&d_plan->binstartpts, numbins[0] * numbins[1] * sizeof(int), stream))))
-                goto finalize;
-        }
-    } break;
-    case 2: {
-        int64_t numbins[2];
-        numbins[0] = ceil((T)nf1 / d_plan->opts.gpu_binsizex);
-        numbins[1] = ceil((T)nf2 / d_plan->opts.gpu_binsizey);
-        if ((ier =
-                 checkCudaErrors(cudaMallocAsync(&d_plan->numsubprob, numbins[0] * numbins[1] * sizeof(int), stream))))
-            goto finalize;
-        if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->binsize, numbins[0] * numbins[1] * sizeof(int), stream))))
-            goto finalize;
-        if ((ier =
-                 checkCudaErrors(cudaMallocAsync(&d_plan->binstartpts, numbins[0] * numbins[1] * sizeof(int), stream))))
-            goto finalize;
-        if ((ier = checkCudaErrors(
-                 cudaMallocAsync(&d_plan->subprobstartpts, (numbins[0] * numbins[1] + 1) * sizeof(int), stream))))
-            goto finalize;
-    } break;
-    default:
-        std::cerr << "[allocgpumem2d_plan] error: invalid method\n";
-    }
-
-    if (!d_plan->opts.gpu_spreadinterponly) {
-        if ((ier = checkCudaErrors(
-                 cudaMallocAsync(&d_plan->fw, maxbatchsize * nf1 * nf2 * sizeof(cuda_complex<T>), stream))))
-            goto finalize;
-        if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->fwkerhalf1, (nf1 / 2 + 1) * sizeof(T), stream))))
-            goto finalize;
-        if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->fwkerhalf2, (nf2 / 2 + 1) * sizeof(T), stream))))
-            goto finalize;
+  utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id);
+  auto &stream = d_plan->stream;
+  int ier;
+
+  int nf1          = d_plan->nf1;
+  int nf2          = d_plan->nf2;
+  int maxbatchsize = d_plan->maxbatchsize;
+
+  switch (d_plan->opts.gpu_method) {
+  case 1: {
+    if (d_plan->opts.gpu_sort) {
+      int numbins[2];
+      numbins[0] = ceil((T)nf1 / d_plan->opts.gpu_binsizex);
+      numbins[1] = ceil((T)nf2 / d_plan->opts.gpu_binsizey);
+      if ((ier = checkCudaErrors(cudaMallocAsync(
+               &d_plan->binsize, numbins[0] * numbins[1] * sizeof(int), stream))))
+        goto finalize;
+      if ((ier = checkCudaErrors(cudaMallocAsync(
+               &d_plan->binstartpts, numbins[0] * numbins[1] * sizeof(int), stream))))
+        goto finalize;
     }
+  } break;
+  case 2: {
+    int64_t numbins[2];
+    numbins[0] = ceil((T)nf1 / d_plan->opts.gpu_binsizex);
+    numbins[1] = ceil((T)nf2 / d_plan->opts.gpu_binsizey);
+    if ((ier = checkCudaErrors(cudaMallocAsync(
+             &d_plan->numsubprob, numbins[0] * numbins[1] * sizeof(int), stream))))
+      goto finalize;
+    if ((ier = checkCudaErrors(cudaMallocAsync(
+             &d_plan->binsize, numbins[0] * numbins[1] * sizeof(int), stream))))
+      goto finalize;
+    if ((ier = checkCudaErrors(cudaMallocAsync(
+             &d_plan->binstartpts, numbins[0] * numbins[1] * sizeof(int), stream))))
+      goto finalize;
+    if ((ier =
+             checkCudaErrors(cudaMallocAsync(&d_plan->subprobstartpts,
+                                             (numbins[0] * numbins[1] + 1) * sizeof(int),
+                                             stream))))
+      goto finalize;
+  } break;
+  default:
+    std::cerr << "[allocgpumem2d_plan] error: invalid method\n";
+  }
+
+  if (!d_plan->opts.gpu_spreadinterponly) {
+    if ((ier = checkCudaErrors(cudaMallocAsync(
+             &d_plan->fw, maxbatchsize * nf1 * nf2 * sizeof(cuda_complex<T>), stream))))
+      goto finalize;
+    if ((ier = checkCudaErrors(
+             cudaMallocAsync(&d_plan->fwkerhalf1, (nf1 / 2 + 1) * sizeof(T), stream))))
+      goto finalize;
+    if ((ier = checkCudaErrors(
+             cudaMallocAsync(&d_plan->fwkerhalf2, (nf2 / 2 + 1) * sizeof(T), stream))))
+      goto finalize;
+  }
 
 finalize:
-    if (ier)
-        freegpumemory(d_plan);
+  if (ier) freegpumemory(d_plan);
 
-    return ier;
+  return ier;
 }
 
-template <typename T>
+template<typename T>
 int allocgpumem2d_nupts(cufinufft_plan_t<T> *d_plan)
 /*
     wrapper for gpu memory allocation in "setNUpts" stage.
@@ -182,41 +195,43 @@ int allocgpumem2d_nupts(cufinufft_plan_t<T> *d_plan)
     Melody Shih 07/25/19
 */
 {
-    utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id);
-    auto &stream = d_plan->stream;
-    int ier;
-
-    const int M = d_plan->M;
-
-    CUDA_FREE_AND_NULL(d_plan->sortidx, stream);
-    CUDA_FREE_AND_NULL(d_plan->idxnupts, stream);
-
-    switch (d_plan->opts.gpu_method) {
-    case 1: {
-        if (d_plan->opts.gpu_sort &&
-            (ier = checkCudaErrors(cudaMallocAsync(&d_plan->sortidx, M * sizeof(int), stream))))
-            goto finalize;
-        if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream))))
-            goto finalize;
-    } break;
-    case 2: {
-        if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream))))
-            goto finalize;
-        if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->sortidx, M * sizeof(int), stream))))
-            goto finalize;
-    } break;
-    default:
-        std::cerr << "[allocgpumem2d_nupts] error: invalid method\n";
-    }
+  utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id);
+  auto &stream = d_plan->stream;
+  int ier;
+
+  const int M = d_plan->M;
+
+  CUDA_FREE_AND_NULL(d_plan->sortidx, stream);
+  CUDA_FREE_AND_NULL(d_plan->idxnupts, stream);
+
+  switch (d_plan->opts.gpu_method) {
+  case 1: {
+    if (d_plan->opts.gpu_sort && (ier = checkCudaErrors(cudaMallocAsync(
+                                      &d_plan->sortidx, M * sizeof(int), stream))))
+      goto finalize;
+    if ((ier = checkCudaErrors(
+             cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream))))
+      goto finalize;
+  } break;
+  case 2: {
+    if ((ier = checkCudaErrors(
+             cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream))))
+      goto finalize;
+    if ((ier =
+             checkCudaErrors(cudaMallocAsync(&d_plan->sortidx, M * sizeof(int), stream))))
+      goto finalize;
+  } break;
+  default:
+    std::cerr << "[allocgpumem2d_nupts] error: invalid method\n";
+  }
 
 finalize:
-    if (ier)
-        freegpumemory(d_plan);
+  if (ier) freegpumemory(d_plan);
 
-    return ier;
+  return ier;
 }
 
-template <typename T>
+template<typename T>
 int allocgpumem3d_plan(cufinufft_plan_t<T> *d_plan)
 /*
     wrapper for gpu memory allocation in "plan" stage.
@@ -224,89 +239,104 @@ int allocgpumem3d_plan(cufinufft_plan_t<T> *d_plan)
     Melody Shih 07/25/19
 */
 {
-    utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id);
-    auto &stream = d_plan->stream;
-    int ier;
-
-    int nf1 = d_plan->nf1;
-    int nf2 = d_plan->nf2;
-    int nf3 = d_plan->nf3;
-    int maxbatchsize = d_plan->maxbatchsize;
-
-    switch (d_plan->opts.gpu_method) {
-    case 1: {
-        if (d_plan->opts.gpu_sort) {
-            const int64_t nbins_tot = ceil((T)nf1 / d_plan->opts.gpu_binsizex) *
-                                      ceil((T)nf2 / d_plan->opts.gpu_binsizey) *
-                                      ceil((T)nf3 / d_plan->opts.gpu_binsizez);
-            if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->binsize, nbins_tot * sizeof(int), stream))))
-                goto finalize;
-            if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->binstartpts, nbins_tot * sizeof(int), stream))))
-                goto finalize;
-        }
-    } break;
-    case 2: {
-        const int64_t nbins_tot = ceil((T)nf1 / d_plan->opts.gpu_binsizex) * ceil((T)nf2 / d_plan->opts.gpu_binsizey) *
-                                  ceil((T)nf3 / d_plan->opts.gpu_binsizez);
-
-        if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->numsubprob, nbins_tot * sizeof(int), stream))))
-            goto finalize;
-        if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->binsize, nbins_tot * sizeof(int), stream))))
-            goto finalize;
-        if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->binstartpts, nbins_tot * sizeof(int), stream))))
-            goto finalize;
-        if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->subprobstartpts, (nbins_tot + 1) * sizeof(int), stream))))
-            goto finalize;
-    } break;
-    case 4: {
-        const int numobins[3] = {(int)ceil((T)nf1 / d_plan->opts.gpu_obinsizex),
-                                 (int)ceil((T)nf2 / d_plan->opts.gpu_obinsizey),
-                                 (int)ceil((T)nf3 / d_plan->opts.gpu_obinsizez)};
-
-        const int binsperobins[3] = {d_plan->opts.gpu_obinsizex / d_plan->opts.gpu_binsizex,
-                                     d_plan->opts.gpu_obinsizey / d_plan->opts.gpu_binsizey,
-                                     d_plan->opts.gpu_obinsizez / d_plan->opts.gpu_binsizez};
-
-        const int numbins[3] = {numobins[0] * (binsperobins[0] + 2), numobins[1] * (binsperobins[1] + 2),
-                                numobins[2] * (binsperobins[2] + 2)};
-
-        const int64_t numobins_tot = numobins[0] * numobins[1] * numobins[2];
-        const int64_t numbins_tot = numbins[0] * numbins[1] * numbins[2];
-
-        if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->numsubprob, numobins_tot * sizeof(int), stream))))
-            goto finalize;
-        if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->binsize, numbins_tot * sizeof(int), stream))))
-            goto finalize;
-        if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->binstartpts, (numbins_tot + 1) * sizeof(int), stream))))
-            goto finalize;
-        if ((ier =
-                 checkCudaErrors(cudaMallocAsync(&d_plan->subprobstartpts, (numobins_tot + 1) * sizeof(int), stream))))
-            goto finalize;
-    } break;
-    default:
-        std::cerr << "[allocgpumem3d_plan] error: invalid method\n";
-    }
-
-    if (!d_plan->opts.gpu_spreadinterponly) {
-        if ((ier = checkCudaErrors(
-                 cudaMallocAsync(&d_plan->fw, maxbatchsize * nf1 * nf2 * nf3 * sizeof(cuda_complex<T>), stream))))
-            goto finalize;
-        if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->fwkerhalf1, (nf1 / 2 + 1) * sizeof(T), stream))))
-            goto finalize;
-        if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->fwkerhalf2, (nf2 / 2 + 1) * sizeof(T), stream))))
-            goto finalize;
-        if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->fwkerhalf3, (nf3 / 2 + 1) * sizeof(T), stream))))
-            goto finalize;
+  utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id);
+  auto &stream = d_plan->stream;
+  int ier;
+
+  int nf1          = d_plan->nf1;
+  int nf2          = d_plan->nf2;
+  int nf3          = d_plan->nf3;
+  int maxbatchsize = d_plan->maxbatchsize;
+
+  switch (d_plan->opts.gpu_method) {
+  case 1: {
+    if (d_plan->opts.gpu_sort) {
+      const int64_t nbins_tot = ceil((T)nf1 / d_plan->opts.gpu_binsizex) *
+                                ceil((T)nf2 / d_plan->opts.gpu_binsizey) *
+                                ceil((T)nf3 / d_plan->opts.gpu_binsizez);
+      if ((ier = checkCudaErrors(
+               cudaMallocAsync(&d_plan->binsize, nbins_tot * sizeof(int), stream))))
+        goto finalize;
+      if ((ier = checkCudaErrors(
+               cudaMallocAsync(&d_plan->binstartpts, nbins_tot * sizeof(int), stream))))
+        goto finalize;
     }
+  } break;
+  case 2: {
+    const int64_t nbins_tot = ceil((T)nf1 / d_plan->opts.gpu_binsizex) *
+                              ceil((T)nf2 / d_plan->opts.gpu_binsizey) *
+                              ceil((T)nf3 / d_plan->opts.gpu_binsizez);
+
+    if ((ier = checkCudaErrors(
+             cudaMallocAsync(&d_plan->numsubprob, nbins_tot * sizeof(int), stream))))
+      goto finalize;
+    if ((ier = checkCudaErrors(
+             cudaMallocAsync(&d_plan->binsize, nbins_tot * sizeof(int), stream))))
+      goto finalize;
+    if ((ier = checkCudaErrors(
+             cudaMallocAsync(&d_plan->binstartpts, nbins_tot * sizeof(int), stream))))
+      goto finalize;
+    if ((ier = checkCudaErrors(cudaMallocAsync(
+             &d_plan->subprobstartpts, (nbins_tot + 1) * sizeof(int), stream))))
+      goto finalize;
+  } break;
+  case 4: {
+    const int numobins[3] = {(int)ceil((T)nf1 / d_plan->opts.gpu_obinsizex),
+                             (int)ceil((T)nf2 / d_plan->opts.gpu_obinsizey),
+                             (int)ceil((T)nf3 / d_plan->opts.gpu_obinsizez)};
+
+    const int binsperobins[3] = {d_plan->opts.gpu_obinsizex / d_plan->opts.gpu_binsizex,
+                                 d_plan->opts.gpu_obinsizey / d_plan->opts.gpu_binsizey,
+                                 d_plan->opts.gpu_obinsizez / d_plan->opts.gpu_binsizez};
+
+    const int numbins[3] = {numobins[0] * (binsperobins[0] + 2),
+                            numobins[1] * (binsperobins[1] + 2),
+                            numobins[2] * (binsperobins[2] + 2)};
+
+    const int64_t numobins_tot = numobins[0] * numobins[1] * numobins[2];
+    const int64_t numbins_tot  = numbins[0] * numbins[1] * numbins[2];
+
+    if ((ier = checkCudaErrors(
+             cudaMallocAsync(&d_plan->numsubprob, numobins_tot * sizeof(int), stream))))
+      goto finalize;
+    if ((ier = checkCudaErrors(
+             cudaMallocAsync(&d_plan->binsize, numbins_tot * sizeof(int), stream))))
+      goto finalize;
+    if ((ier = checkCudaErrors(cudaMallocAsync(
+             &d_plan->binstartpts, (numbins_tot + 1) * sizeof(int), stream))))
+      goto finalize;
+    if ((ier = checkCudaErrors(cudaMallocAsync(
+             &d_plan->subprobstartpts, (numobins_tot + 1) * sizeof(int), stream))))
+      goto finalize;
+  } break;
+  default:
+    std::cerr << "[allocgpumem3d_plan] error: invalid method\n";
+  }
+
+  if (!d_plan->opts.gpu_spreadinterponly) {
+    if ((ier = checkCudaErrors(
+             cudaMallocAsync(&d_plan->fw,
+                             maxbatchsize * nf1 * nf2 * nf3 * sizeof(cuda_complex<T>),
+                             stream))))
+      goto finalize;
+    if ((ier = checkCudaErrors(
+             cudaMallocAsync(&d_plan->fwkerhalf1, (nf1 / 2 + 1) * sizeof(T), stream))))
+      goto finalize;
+    if ((ier = checkCudaErrors(
+             cudaMallocAsync(&d_plan->fwkerhalf2, (nf2 / 2 + 1) * sizeof(T), stream))))
+      goto finalize;
+    if ((ier = checkCudaErrors(
+             cudaMallocAsync(&d_plan->fwkerhalf3, (nf3 / 2 + 1) * sizeof(T), stream))))
+      goto finalize;
+  }
 
 finalize:
-    if (ier)
-        freegpumemory(d_plan);
+  if (ier) freegpumemory(d_plan);
 
-    return ier;
+  return ier;
 }
 
-template <typename T>
+template<typename T>
 int allocgpumem3d_nupts(cufinufft_plan_t<T> *d_plan)
 /*
     wrapper for gpu memory allocation in "setNUpts" stage.
@@ -314,44 +344,47 @@ int allocgpumem3d_nupts(cufinufft_plan_t<T> *d_plan)
     Melody Shih 07/25/19
 */
 {
-    utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id);
-    auto &stream = d_plan->stream;
-    int ier;
-    int M = d_plan->M;
-
-    CUDA_FREE_AND_NULL(d_plan->sortidx, stream);
-    CUDA_FREE_AND_NULL(d_plan->idxnupts, stream)
-
-    switch (d_plan->opts.gpu_method) {
-    case 1: {
-        if (d_plan->opts.gpu_sort &&
-            ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->sortidx, M * sizeof(int), stream)))))
-            goto finalize;
-        if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream))))
-            goto finalize;
-    } break;
-    case 2: {
-        if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream))))
-            goto finalize;
-        if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->sortidx, M * sizeof(int), stream))))
-            goto finalize;
-    } break;
-    case 4: {
-        if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->sortidx, M * sizeof(int), stream))))
-            goto finalize;
-    } break;
-    default:
-        std::cerr << "[allocgpumem3d_nupts] error: invalid method\n";
-    }
+  utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id);
+  auto &stream = d_plan->stream;
+  int ier;
+  int M = d_plan->M;
+
+  CUDA_FREE_AND_NULL(d_plan->sortidx, stream);
+  CUDA_FREE_AND_NULL(d_plan->idxnupts, stream)
+
+  switch (d_plan->opts.gpu_method) {
+  case 1: {
+    if (d_plan->opts.gpu_sort && ((ier = checkCudaErrors(cudaMallocAsync(
+                                       &d_plan->sortidx, M * sizeof(int), stream)))))
+      goto finalize;
+    if ((ier = checkCudaErrors(
+             cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream))))
+      goto finalize;
+  } break;
+  case 2: {
+    if ((ier = checkCudaErrors(
+             cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream))))
+      goto finalize;
+    if ((ier =
+             checkCudaErrors(cudaMallocAsync(&d_plan->sortidx, M * sizeof(int), stream))))
+      goto finalize;
+  } break;
+  case 4: {
+    if ((ier =
+             checkCudaErrors(cudaMallocAsync(&d_plan->sortidx, M * sizeof(int), stream))))
+      goto finalize;
+  } break;
+  default:
+    std::cerr << "[allocgpumem3d_nupts] error: invalid method\n";
+  }
 
 finalize:
-    if (ier)
-        freegpumemory(d_plan);
+  if (ier) freegpumemory(d_plan);
 
-    return ier;
+  return ier;
 }
 
-template <typename T>
+template<typename T>
 void freegpumemory(cufinufft_plan_t<T> *d_plan)
 /*
     wrapper for freeing gpu memory.
@@ -359,24 +392,24 @@ void freegpumemory(cufinufft_plan_t<T> *d_plan)
     Melody Shih 11/21/21
 */
 {
-    utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id);
-    auto &stream = d_plan->stream;
-
-    CUDA_FREE_AND_NULL(d_plan->fw, stream);
-    CUDA_FREE_AND_NULL(d_plan->fwkerhalf1, stream);
-    CUDA_FREE_AND_NULL(d_plan->fwkerhalf2, stream);
-    CUDA_FREE_AND_NULL(d_plan->fwkerhalf3, stream);
-
-    CUDA_FREE_AND_NULL(d_plan->idxnupts, stream);
-    CUDA_FREE_AND_NULL(d_plan->sortidx, stream);
-    CUDA_FREE_AND_NULL(d_plan->numsubprob, stream);
-    CUDA_FREE_AND_NULL(d_plan->binsize, stream);
-    CUDA_FREE_AND_NULL(d_plan->binstartpts, stream);
-    CUDA_FREE_AND_NULL(d_plan->subprob_to_bin, stream);
-    CUDA_FREE_AND_NULL(d_plan->subprobstartpts, stream);
-
-    CUDA_FREE_AND_NULL(d_plan->numnupts, stream);
-    CUDA_FREE_AND_NULL(d_plan->numsubprob, stream);
+  utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id);
+  auto &stream = d_plan->stream;
+
+  CUDA_FREE_AND_NULL(d_plan->fw, stream);
+  CUDA_FREE_AND_NULL(d_plan->fwkerhalf1, stream);
+  CUDA_FREE_AND_NULL(d_plan->fwkerhalf2, stream);
+  CUDA_FREE_AND_NULL(d_plan->fwkerhalf3, stream);
+
+  CUDA_FREE_AND_NULL(d_plan->idxnupts, stream);
+  CUDA_FREE_AND_NULL(d_plan->sortidx, stream);
+  CUDA_FREE_AND_NULL(d_plan->numsubprob, stream);
+  CUDA_FREE_AND_NULL(d_plan->binsize, stream);
+  CUDA_FREE_AND_NULL(d_plan->binstartpts, stream);
+  CUDA_FREE_AND_NULL(d_plan->subprob_to_bin, stream);
+  CUDA_FREE_AND_NULL(d_plan->subprobstartpts, stream);
+
+  CUDA_FREE_AND_NULL(d_plan->numnupts, stream);
+  CUDA_FREE_AND_NULL(d_plan->numsubprob, stream);
 }
 
 template int allocgpumem1d_plan<float>(cufinufft_plan_t<float> *d_plan);
diff --git a/src/cuda/precision_independent.cu b/src/cuda/precision_independent.cu
index 1ab2865e0..66cc5ca69 100644
--- a/src/cuda/precision_independent.cu
+++ b/src/cuda/precision_independent.cu
@@ -18,216 +18,237 @@ __device__ RT carg(const CT &z) { return (RT)atan2(ipart(z), rpart(z)); } // pol
 __device__ RT cabs(const CT &z) { return (RT)cuCabs(z); }
 
 /* Common Kernels from spreadinterp3d */
-__host__ __device__ int calc_global_index(int xidx, int yidx, int zidx, int onx, int ony, int onz, int bnx, int bny,
-                                          int bnz) {
-    int oix, oiy, oiz;
-    oix = xidx / bnx;
-    oiy = yidx / bny;
-    oiz = zidx / bnz;
-    return (oix + oiy * onx + oiz * ony * onx) * (bnx * bny * bnz) +
-           (xidx % bnx + yidx % bny * bnx + zidx % bnz * bny * bnx);
+__host__ __device__ int calc_global_index(int xidx, int yidx, int zidx, int onx, int ony,
+                                          int onz, int bnx, int bny, int bnz) {
+  int oix, oiy, oiz;
+  oix = xidx / bnx;
+  oiy = yidx / bny;
+  oiz = zidx / bnz;
+  return (oix + oiy * onx + oiz * ony * onx) * (bnx * bny * bnz) +
+         (xidx % bnx + yidx % bny * bnx + zidx % bnz * bny * bnx);
 }
 
-__device__ int calc_global_index_v2(int xidx, int yidx, int zidx, int nbinx, int nbiny, int nbinz) {
-    return xidx + yidx * nbinx + zidx * nbinx * nbiny;
+__device__ int calc_global_index_v2(int xidx, int yidx, int zidx, int nbinx, int nbiny,
+                                    int nbinz) {
+  return xidx + yidx * nbinx + zidx * nbinx * nbiny;
 }
 
 /* spreadinterp 1d */
-__global__ void calc_subprob_1d(int *bin_size, int *num_subprob, int maxsubprobsize, int numbins) {
-    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; i += gridDim.x * blockDim.x) {
-        num_subprob[i] = ceil(bin_size[i] / (float)maxsubprobsize);
-    }
+__global__ void calc_subprob_1d(int *bin_size, int *num_subprob, int maxsubprobsize,
+                                int numbins) {
+  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins;
+       i += gridDim.x * blockDim.x) {
+    num_subprob[i] = ceil(bin_size[i] / (float)maxsubprobsize);
+  }
 }
 
-__global__ void map_b_into_subprob_1d(int *d_subprob_to_bin, int *d_subprobstartpts, int *d_numsubprob, int numbins) {
-    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; i += gridDim.x * blockDim.x) {
-        for (int j = 0; j < d_numsubprob[i]; j++) {
-            d_subprob_to_bin[d_subprobstartpts[i] + j] = i;
-        }
+__global__ void map_b_into_subprob_1d(int *d_subprob_to_bin, int *d_subprobstartpts,
+                                      int *d_numsubprob, int numbins) {
+  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins;
+       i += gridDim.x * blockDim.x) {
+    for (int j = 0; j < d_numsubprob[i]; j++) {
+      d_subprob_to_bin[d_subprobstartpts[i] + j] = i;
     }
+  }
 }
 
 __global__ void trivial_global_sort_index_1d(int M, int *index) {
-    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) {
-        index[i] = i;
-    }
+  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M;
+       i += gridDim.x * blockDim.x) {
+    index[i] = i;
+  }
 }
 
 /* spreadinterp 2d */
-__global__ void calc_subprob_2d(int *bin_size, int *num_subprob, int maxsubprobsize, int numbins) {
-    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; i += gridDim.x * blockDim.x) {
-        num_subprob[i] = ceil(bin_size[i] / (float)maxsubprobsize);
-    }
+__global__ void calc_subprob_2d(int *bin_size, int *num_subprob, int maxsubprobsize,
+                                int numbins) {
+  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins;
+       i += gridDim.x * blockDim.x) {
+    num_subprob[i] = ceil(bin_size[i] / (float)maxsubprobsize);
+  }
 }
 
-__global__ void map_b_into_subprob_2d(int *d_subprob_to_bin, int *d_subprobstartpts, int *d_numsubprob, int numbins) {
-    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; i += gridDim.x * blockDim.x) {
-        for (int j = 0; j < d_numsubprob[i]; j++) {
-            d_subprob_to_bin[d_subprobstartpts[i] + j] = i;
-        }
+__global__ void map_b_into_subprob_2d(int *d_subprob_to_bin, int *d_subprobstartpts,
+                                      int *d_numsubprob, int numbins) {
+  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins;
+       i += gridDim.x * blockDim.x) {
+    for (int j = 0; j < d_numsubprob[i]; j++) {
+      d_subprob_to_bin[d_subprobstartpts[i] + j] = i;
     }
+  }
 }
 
 __global__ void trivial_global_sort_index_2d(int M, int *index) {
-    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) {
-        index[i] = i;
-    }
+  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M;
+       i += gridDim.x * blockDim.x) {
+    index[i] = i;
+  }
 }
 
 /* spreadinterp3d */
-__global__ void calc_subprob_3d_v2(int *bin_size, int *num_subprob, int maxsubprobsize, int numbins) {
-    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; i += gridDim.x * blockDim.x) {
-        num_subprob[i] = ceil(bin_size[i] / (float)maxsubprobsize);
-    }
+__global__ void calc_subprob_3d_v2(int *bin_size, int *num_subprob, int maxsubprobsize,
+                                   int numbins) {
+  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins;
+       i += gridDim.x * blockDim.x) {
+    num_subprob[i] = ceil(bin_size[i] / (float)maxsubprobsize);
+  }
 }
 
-__global__ void map_b_into_subprob_3d_v2(int *d_subprob_to_bin, int *d_subprobstartpts, int *d_numsubprob,
-                                         int numbins) {
-    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; i += gridDim.x * blockDim.x) {
-        for (int j = 0; j < d_numsubprob[i]; j++) {
-            d_subprob_to_bin[d_subprobstartpts[i] + j] = i;
-        }
+__global__ void map_b_into_subprob_3d_v2(int *d_subprob_to_bin, int *d_subprobstartpts,
+                                         int *d_numsubprob, int numbins) {
+  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins;
+       i += gridDim.x * blockDim.x) {
+    for (int j = 0; j < d_numsubprob[i]; j++) {
+      d_subprob_to_bin[d_subprobstartpts[i] + j] = i;
     }
+  }
 }
 
-__global__ void calc_subprob_3d_v1(int binsperobinx, int binsperobiny, int binsperobinz, int *bin_size,
-                                   int *num_subprob, int maxsubprobsize, int numbins) {
-    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; i += gridDim.x * blockDim.x) {
-        int numnupts = 0;
-        int binsperobin = binsperobinx * binsperobiny * binsperobinz;
-        for (int b = 0; b < binsperobin; b++) {
-            numnupts += bin_size[binsperobin * i + b];
-        }
-        num_subprob[i] = ceil(numnupts / (float)maxsubprobsize);
+__global__ void calc_subprob_3d_v1(int binsperobinx, int binsperobiny, int binsperobinz,
+                                   int *bin_size, int *num_subprob, int maxsubprobsize,
+                                   int numbins) {
+  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins;
+       i += gridDim.x * blockDim.x) {
+    int numnupts    = 0;
+    int binsperobin = binsperobinx * binsperobiny * binsperobinz;
+    for (int b = 0; b < binsperobin; b++) {
+      numnupts += bin_size[binsperobin * i + b];
     }
+    num_subprob[i] = ceil(numnupts / (float)maxsubprobsize);
+  }
 }
 
-__global__ void map_b_into_subprob_3d_v1(int *d_subprob_to_obin, int *d_subprobstartpts, int *d_numsubprob,
-                                         int numbins) {
-    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; i += gridDim.x * blockDim.x) {
-        for (int j = 0; j < d_numsubprob[i]; j++) {
-            d_subprob_to_obin[d_subprobstartpts[i] + j] = i;
-        }
+__global__ void map_b_into_subprob_3d_v1(int *d_subprob_to_obin, int *d_subprobstartpts,
+                                         int *d_numsubprob, int numbins) {
+  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins;
+       i += gridDim.x * blockDim.x) {
+    for (int j = 0; j < d_numsubprob[i]; j++) {
+      d_subprob_to_obin[d_subprobstartpts[i] + j] = i;
     }
+  }
 }
 
 __global__ void trivial_global_sort_index_3d(int M, int *index) {
-    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) {
-        index[i] = i;
-    }
+  for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M;
+       i += gridDim.x * blockDim.x) {
+    index[i] = i;
+  }
 }
 
-__global__ void fill_ghost_bins(int binsperobinx, int binsperobiny, int binsperobinz, int nobinx, int nobiny,
-                                int nobinz, int *binsize) {
-    int binx = threadIdx.x + blockIdx.x * blockDim.x;
-    int biny = threadIdx.y + blockIdx.y * blockDim.y;
-    int binz = threadIdx.z + blockIdx.z * blockDim.z;
-
-    int nbinx = nobinx * binsperobinx;
-    int nbiny = nobiny * binsperobiny;
-    int nbinz = nobinz * binsperobinz;
-
-    if (binx < nbinx && biny < nbiny && binz < nbinz) {
-        int binidx =
-            calc_global_index(binx, biny, binz, nobinx, nobiny, nobinz, binsperobinx, binsperobiny, binsperobinz);
-        int i, j, k;
-        i = binx;
-        j = biny;
-        k = binz;
-        if (binx % binsperobinx == 0) {
-            i = binx - 2;
-            i = i < 0 ? i + nbinx : i;
-        }
-        if (binx % binsperobinx == binsperobinx - 1) {
-            i = binx + 2;
-            i = (i >= nbinx) ? i - nbinx : i;
-        }
-        if (biny % binsperobiny == 0) {
-            j = biny - 2;
-            j = j < 0 ? j + nbiny : j;
-        }
-        if (biny % binsperobiny == binsperobiny - 1) {
-            j = biny + 2;
-            j = (j >= nbiny) ? j - nbiny : j;
-        }
-        if (binz % binsperobinz == 0) {
-            k = binz - 2;
-            k = k < 0 ? k + nbinz : k;
-        }
-        if (binz % binsperobinz == binsperobinz - 1) {
-            k = binz + 2;
-            k = (k >= nbinz) ? k - nbinz : k;
-        }
-        int idxtoupdate = calc_global_index(i, j, k, nobinx, nobiny, nobinz, binsperobinx, binsperobiny, binsperobinz);
-        if (idxtoupdate != binidx) {
-            binsize[binidx] = binsize[idxtoupdate];
-        }
-    }
-}
+__global__ void fill_ghost_bins(int binsperobinx, int binsperobiny, int binsperobinz,
+                                int nobinx, int nobiny, int nobinz, int *binsize) {
+  int binx = threadIdx.x + blockIdx.x * blockDim.x;
+  int biny = threadIdx.y + blockIdx.y * blockDim.y;
+  int binz = threadIdx.z + blockIdx.z * blockDim.z;
 
-__global__ void ghost_bin_pts_index(int binsperobinx, int binsperobiny, int binsperobinz, int nobinx, int nobiny,
-                                    int nobinz, int *binsize, int *index, int *binstartpts, int M) {
-    int binx = threadIdx.x + blockIdx.x * blockDim.x;
-    int biny = threadIdx.y + blockIdx.y * blockDim.y;
-    int binz = threadIdx.z + blockIdx.z * blockDim.z;
-    int nbinx = nobinx * binsperobinx;
-    int nbiny = nobiny * binsperobiny;
-    int nbinz = nobinz * binsperobinz;
+  int nbinx = nobinx * binsperobinx;
+  int nbiny = nobiny * binsperobiny;
+  int nbinz = nobinz * binsperobinz;
 
+  if (binx < nbinx && biny < nbiny && binz < nbinz) {
+    int binidx = calc_global_index(binx, biny, binz, nobinx, nobiny, nobinz, binsperobinx,
+                                   binsperobiny, binsperobinz);
     int i, j, k;
-    int w = 0;
-    int box[3];
-    if (binx < nbinx && biny < nbiny && binz < nbinz) {
-        box[0] = box[1] = box[2] = 0;
-        i = binx;
-        j = biny;
-        k = binz;
-        int binidx =
-            calc_global_index(binx, biny, binz, nobinx, nobiny, nobinz, binsperobinx, binsperobiny, binsperobinz);
-        if (binx % binsperobinx == 0) {
-            i = binx - 2;
-            box[0] = (i < 0);
-            i = i < 0 ? i + nbinx : i;
-            w = 1;
-        }
-        if (binx % binsperobinx == binsperobinx - 1) {
-            i = binx + 2;
-            box[0] = (i > nbinx) * 2;
-            i = (i > nbinx) ? i - nbinx : i;
-            w = 1;
-        }
-        if (biny % binsperobiny == 0) {
-            j = biny - 2;
-            box[1] = (j < 0);
-            j = j < 0 ? j + nbiny : j;
-            w = 1;
-        }
-        if (biny % binsperobiny == binsperobiny - 1) {
-            j = biny + 2;
-            box[1] = (j > nbiny) * 2;
-            j = (j > nbiny) ? j - nbiny : j;
-            w = 1;
-        }
-        if (binz % binsperobinz == 0) {
-            k = binz - 2;
-            box[2] = (k < 0);
-            k = k < 0 ? k + nbinz : k;
-            w = 1;
-        }
-        if (binz % binsperobinz == binsperobinz - 1) {
-            k = binz + 2;
-            box[2] = (k > nbinz) * 2;
-            k = (k > nbinz) ? k - nbinz : k;
-            w = 1;
-        }
-        int corbinidx = calc_global_index(i, j, k, nobinx, nobiny, nobinz, binsperobinx, binsperobiny, binsperobinz);
-        if (w == 1) {
-            for (int n = 0; n < binsize[binidx]; n++) {
-                index[binstartpts[binidx] + n] =
-                    M * (box[0] + box[1] * 3 + box[2] * 9) + index[binstartpts[corbinidx] + n];
-            }
-        }
+    i = binx;
+    j = biny;
+    k = binz;
+    if (binx % binsperobinx == 0) {
+      i = binx - 2;
+      i = i < 0 ? i + nbinx : i;
+    }
+    if (binx % binsperobinx == binsperobinx - 1) {
+      i = binx + 2;
+      i = (i >= nbinx) ? i - nbinx : i;
+    }
+    if (biny % binsperobiny == 0) {
+      j = biny - 2;
+      j = j < 0 ? j + nbiny : j;
+    }
+    if (biny % binsperobiny == binsperobiny - 1) {
+      j = biny + 2;
+      j = (j >= nbiny) ? j - nbiny : j;
+    }
+    if (binz % binsperobinz == 0) {
+      k = binz - 2;
+      k = k < 0 ? k + nbinz : k;
+    }
+    if (binz % binsperobinz == binsperobinz - 1) {
+      k = binz + 2;
+      k = (k >= nbinz) ? k - nbinz : k;
+    }
+    int idxtoupdate = calc_global_index(i, j, k, nobinx, nobiny, nobinz, binsperobinx,
+                                        binsperobiny, binsperobinz);
+    if (idxtoupdate != binidx) {
+      binsize[binidx] = binsize[idxtoupdate];
+    }
+  }
+}
+
+__global__ void ghost_bin_pts_index(int binsperobinx, int binsperobiny, int binsperobinz,
+                                    int nobinx, int nobiny, int nobinz, int *binsize,
+                                    int *index, int *binstartpts, int M) {
+  int binx  = threadIdx.x + blockIdx.x * blockDim.x;
+  int biny  = threadIdx.y + blockIdx.y * blockDim.y;
+  int binz  = threadIdx.z + blockIdx.z * blockDim.z;
+  int nbinx = nobinx * binsperobinx;
+  int nbiny = nobiny * binsperobiny;
+  int nbinz = nobinz * binsperobinz;
+
+  int i, j, k;
+  int w = 0;
+  int box[3];
+  if (binx < nbinx && biny < nbiny && binz < nbinz) {
+    box[0] = box[1] = box[2] = 0;
+    i                        = binx;
+    j                        = biny;
+    k                        = binz;
+    int binidx = calc_global_index(binx, biny, binz, nobinx, nobiny, nobinz, binsperobinx,
+                                   binsperobiny, binsperobinz);
+    if (binx % binsperobinx == 0) {
+      i      = binx - 2;
+      box[0] = (i < 0);
+      i      = i < 0 ? i + nbinx : i;
+      w      = 1;
+    }
+    if (binx % binsperobinx == binsperobinx - 1) {
+      i      = binx + 2;
+      box[0] = (i > nbinx) * 2;
+      i      = (i > nbinx) ? i - nbinx : i;
+      w      = 1;
+    }
+    if (biny % binsperobiny == 0) {
+      j      = biny - 2;
+      box[1] = (j < 0);
+      j      = j < 0 ? j + nbiny : j;
+      w      = 1;
+    }
+    if (biny % binsperobiny == binsperobiny - 1) {
+      j      = biny + 2;
+      box[1] = (j > nbiny) * 2;
+      j      = (j > nbiny) ? j - nbiny : j;
+      w      = 1;
+    }
+    if (binz % binsperobinz == 0) {
+      k      = binz - 2;
+      box[2] = (k < 0);
+      k      = k < 0 ? k + nbinz : k;
+      w      = 1;
+    }
+    if (binz % binsperobinz == binsperobinz - 1) {
+      k      = binz + 2;
+      box[2] = (k > nbinz) * 2;
+      k      = (k > nbinz) ? k - nbinz : k;
+      w      = 1;
+    }
+    int corbinidx = calc_global_index(i, j, k, nobinx, nobiny, nobinz, binsperobinx,
+                                      binsperobiny, binsperobinz);
+    if (w == 1) {
+      for (int n = 0; n < binsize[binidx]; n++) {
+        index[binstartpts[binidx] + n] =
+            M * (box[0] + box[1] * 3 + box[2] * 9) + index[binstartpts[corbinidx] + n];
+      }
     }
+  }
 }
 
 } // namespace common
diff --git a/src/cuda/spreadinterp.cpp b/src/cuda/spreadinterp.cpp
index f129f73b7..5a1c9a08e 100644
--- a/src/cuda/spreadinterp.cpp
+++ b/src/cuda/spreadinterp.cpp
@@ -13,7 +13,7 @@
 namespace cufinufft {
 namespace spreadinterp {
 
-template <typename T>
+template<typename T>
 int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmeth)
 // Initializes spreader kernel parameters given desired NUFFT tolerance eps,
 // upsampling factor (=sigma in paper, or R in Dutt-Rokhlin), and ker eval meth
@@ -22,70 +22,74 @@ int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmet
 // Must call before any kernel evals done.
 // Returns: 0 success, 1, warning, >1 failure (see error codes in utils.h)
 {
-    if (upsampfac != 2.0) { // nonstandard sigma
-        if (kerevalmeth == 1) {
-            fprintf(stderr, "[%s] nonstandard upsampfac=%.3g cannot be handled by kerevalmeth=1\n", __func__,
-                    upsampfac);
-            return FINUFFT_ERR_HORNER_WRONG_BETA;
-        }
-        if (upsampfac <= 1.0) {
-            fprintf(stderr, "[%s] error: upsampfac=%.3g is <=1.0\n", __func__, upsampfac);
-            return FINUFFT_ERR_UPSAMPFAC_TOO_SMALL;
-        }
-        // calling routine must abort on above errors, since opts is garbage!
-        if (upsampfac > 4.0)
-            fprintf(stderr, "[%s] warning: upsampfac=%.3g is too large to be beneficial!\n", __func__, upsampfac);
+  if (upsampfac != 2.0) { // nonstandard sigma
+    if (kerevalmeth == 1) {
+      fprintf(stderr,
+              "[%s] nonstandard upsampfac=%.3g cannot be handled by kerevalmeth=1\n",
+              __func__, upsampfac);
+      return FINUFFT_ERR_HORNER_WRONG_BETA;
     }
+    if (upsampfac <= 1.0) {
+      fprintf(stderr, "[%s] error: upsampfac=%.3g is <=1.0\n", __func__, upsampfac);
+      return FINUFFT_ERR_UPSAMPFAC_TOO_SMALL;
+    }
+    // calling routine must abort on above errors, since opts is garbage!
+    if (upsampfac > 4.0)
+      fprintf(stderr, "[%s] warning: upsampfac=%.3g is too large to be beneficial!\n",
+              __func__, upsampfac);
+  }
 
-    // defaults... (user can change after this function called)
-    opts.spread_direction = 1; // user should always set to 1 or 2 as desired
-    opts.upsampfac = upsampfac;
+  // defaults... (user can change after this function called)
+  opts.spread_direction = 1; // user should always set to 1 or 2 as desired
+  opts.upsampfac        = upsampfac;
 
-    // as in FINUFFT v2.0, allow too-small-eps by truncating to eps_mach...
-    int ier = 0;
+  // as in FINUFFT v2.0, allow too-small-eps by truncating to eps_mach...
+  int ier = 0;
 
-    constexpr T EPSILON = std::numeric_limits<T>::epsilon();
-    if (eps < EPSILON) {
-        fprintf(stderr, "setup_spreader: warning, increasing tol=%.3g to eps_mach=%.3g.\n", (double)eps,
-                (double)EPSILON);
-        eps = EPSILON;
-        ier = FINUFFT_WARN_EPS_TOO_SMALL;
-    }
+  constexpr T EPSILON = std::numeric_limits<T>::epsilon();
+  if (eps < EPSILON) {
+    fprintf(stderr, "setup_spreader: warning, increasing tol=%.3g to eps_mach=%.3g.\n",
+            (double)eps, (double)EPSILON);
+    eps = EPSILON;
+    ier = FINUFFT_WARN_EPS_TOO_SMALL;
+  }
 
-    // Set kernel width w (aka ns) and ES kernel beta parameter, in opts...
-    int ns = std::ceil(-log10(eps / (T)10.0));                           // 1 digit per power of ten
-    if (upsampfac != 2.0)                                                // override ns for custom sigma
-        ns = std::ceil(-log(eps) / (T(M_PI) * sqrt(1 - 1 / upsampfac))); // formula, gamma=1
-    ns = std::max(2, ns);                                                // we don't have ns=1 version yet
-    if (ns > MAX_NSPREAD) {                                              // clip to match allocated arrays
-        fprintf(stderr, "%s warning: at upsampfac=%.3g, tol=%.3g would need kernel width ns=%d; clipping to max %d.\n",
-                __func__, upsampfac, (double)eps, ns, MAX_NSPREAD);
-        ns = MAX_NSPREAD;
-        ier = FINUFFT_WARN_EPS_TOO_SMALL;
-    }
-    opts.nspread = ns;
-    opts.ES_halfwidth = (T)ns / 2; // constants to help ker eval (except Horner)
-    opts.ES_c = 4.0 / (T)(ns * ns);
+  // Set kernel width w (aka ns) and ES kernel beta parameter, in opts...
+  int ns = std::ceil(-log10(eps / (T)10.0)); // 1 digit per power of ten
+  if (upsampfac != 2.0)                      // override ns for custom sigma
+    ns = std::ceil(-log(eps) / (T(M_PI) * sqrt(1 - 1 / upsampfac))); // formula, gamma=1
+  ns = std::max(2, ns);   // we don't have ns=1 version yet
+  if (ns > MAX_NSPREAD) { // clip to match allocated arrays
+    fprintf(stderr,
+            "%s warning: at upsampfac=%.3g, tol=%.3g would need kernel width ns=%d; "
+            "clipping to max %d.\n",
+            __func__, upsampfac, (double)eps, ns, MAX_NSPREAD);
+    ns  = MAX_NSPREAD;
+    ier = FINUFFT_WARN_EPS_TOO_SMALL;
+  }
+  opts.nspread      = ns;
+  opts.ES_halfwidth = (T)ns / 2; // constants to help ker eval (except Horner)
+  opts.ES_c         = 4.0 / (T)(ns * ns);
 
-    T betaoverns = 2.30; // gives decent betas for default sigma=2.0
-    if (ns == 2)
-        betaoverns = 2.20; // some small-width tweaks...
-    if (ns == 3)
-        betaoverns = 2.26;
-    if (ns == 4)
-        betaoverns = 2.38;
-    if (upsampfac != 2.0) {                                       // again, override beta for custom sigma
-        T gamma = 0.97;                                           // must match devel/gen_all_horner_C_code.m
-        betaoverns = gamma * T(M_PI) * (1 - 1 / (2 * upsampfac)); // formula based on cutoff
-    }
-    opts.ES_beta = betaoverns * (T)ns; // set the kernel beta parameter
-    // fprintf(stderr,"setup_spreader: sigma=%.6f, chose ns=%d beta=%.6f\n",(double)upsampfac,ns,(double)opts.ES_beta);
-    // // user hasn't set debug yet
-    return ier;
+  T betaoverns = 2.30;            // gives decent betas for default sigma=2.0
+  if (ns == 2) betaoverns = 2.20; // some small-width tweaks...
+  if (ns == 3) betaoverns = 2.26;
+  if (ns == 4) betaoverns = 2.38;
+  if (upsampfac != 2.0) { // again, override beta for custom sigma
+    T gamma    = 0.97;    // must match devel/gen_all_horner_C_code.m
+    betaoverns = gamma * T(M_PI) * (1 - 1 / (2 * upsampfac)); // formula based on cutoff
+  }
+  opts.ES_beta = betaoverns * (T)ns; // set the kernel beta parameter
+  // fprintf(stderr,"setup_spreader: sigma=%.6f, chose ns=%d beta=%.6f\n",
+  //         (double)upsampfac,ns,(double)opts.ES_beta);
+  // // user hasn't set debug yet
+  return ier;
 }
 
-template int setup_spreader(finufft_spread_opts &opts, float eps, float upsampfac, int kerevalmeth);
-template int setup_spreader(finufft_spread_opts &opts, double eps, double upsampfac, int kerevalmeth);
+template int setup_spreader(finufft_spread_opts &opts, float eps, float upsampfac,
+                            int kerevalmeth);
+template int setup_spreader(finufft_spread_opts &opts, double eps, double upsampfac,
+                            int kerevalmeth);
 template float evaluate_kernel(float x, const finufft_spread_opts &opts);
 template double evaluate_kernel(double x, const finufft_spread_opts &opts);
 
diff --git a/src/cuda/utils.cpp b/src/cuda/utils.cpp
index 1c10f3453..9c3003cb8 100644
--- a/src/cuda/utils.cpp
+++ b/src/cuda/utils.cpp
@@ -9,23 +9,18 @@ CUFINUFFT_BIGINT next235beven(CUFINUFFT_BIGINT n, CUFINUFFT_BIGINT b)
 // changed INT64 type 3/28/17. Runtime is around n*1e-11 sec for big n.
 // added condition about b Melody 05/31/20
 {
-    if (n <= 2)
-        return 2;
-    if (n % 2 == 1)
-        n += 1;                     // even
-    CUFINUFFT_BIGINT nplus = n - 2; // to cancel out the +=2 at start of loop
-    CUFINUFFT_BIGINT numdiv = 2;    // a dummy that is >1
-    while ((numdiv > 1) || (nplus % b != 0)) {
-        nplus += 2; // stays even
-        numdiv = nplus;
-        while (numdiv % 2 == 0)
-            numdiv /= 2; // remove all factors of 2,3,5...
-        while (numdiv % 3 == 0)
-            numdiv /= 3;
-        while (numdiv % 5 == 0)
-            numdiv /= 5;
-    }
-    return nplus;
+  if (n <= 2) return 2;
+  if (n % 2 == 1) n += 1;                // even
+  CUFINUFFT_BIGINT nplus  = n - 2;       // to cancel out the +=2 at start of loop
+  CUFINUFFT_BIGINT numdiv = 2;           // a dummy that is >1
+  while ((numdiv > 1) || (nplus % b != 0)) {
+    nplus += 2;                          // stays even
+    numdiv = nplus;
+    while (numdiv % 2 == 0) numdiv /= 2; // remove all factors of 2,3,5...
+    while (numdiv % 3 == 0) numdiv /= 3;
+    while (numdiv % 5 == 0) numdiv /= 5;
+  }
+  return nplus;
 }
 
 // ----------------------- helpers for timing (always stay double prec)...
@@ -35,19 +30,19 @@ void CNTime::start() { gettimeofday(&initial, 0); }
 double CNTime::restart()
 // Barnett changed to returning in sec
 {
-    double delta = this->elapsedsec();
-    this->start();
-    return delta;
+  double delta = this->elapsedsec();
+  this->start();
+  return delta;
 }
 
 double CNTime::elapsedsec()
 // returns answers as double, in seconds, to microsec accuracy. Barnett 5/22/18
 {
-    struct timeval now;
-    gettimeofday(&now, 0);
-    double nowsec = (double)now.tv_sec + 1e-6 * now.tv_usec;
-    double initialsec = (double)initial.tv_sec + 1e-6 * initial.tv_usec;
-    return nowsec - initialsec;
+  struct timeval now;
+  gettimeofday(&now, 0);
+  double nowsec     = (double)now.tv_sec + 1e-6 * now.tv_usec;
+  double initialsec = (double)initial.tv_sec + 1e-6 * initial.tv_usec;
+  return nowsec - initialsec;
 }
 
 } // namespace utils
diff --git a/src/finufft.cpp b/src/finufft.cpp
index 5b33ef126..8b9c6006b 100644
--- a/src/finufft.cpp
+++ b/src/finufft.cpp
@@ -4,19 +4,19 @@
 // private headers for lib build
 // (must come after finufft.h which clobbers FINUFFT* macros)
 #include <finufft/defs.h>
+#include <finufft/fftw_defs.h>
+#include <finufft/spreadinterp.h>
 #include <finufft/utils.h>
 #include <finufft/utils_precindep.h>
-#include <finufft/spreadinterp.h>
-#include <finufft/fftw_defs.h>
 
-#include <iostream>
+#include "../contrib/legendre_rule_fast.h"
 #include <iomanip>
+#include <iostream>
 #include <math.h>
 #include <mutex>
 #include <stdio.h>
 #include <stdlib.h>
 #include <vector>
-#include "../contrib/legendre_rule_fast.h"
 
 using namespace std;
 using namespace finufft;
@@ -24,7 +24,6 @@ using namespace finufft::utils;
 using namespace finufft::spreadinterp;
 using namespace finufft::quadrature;
 
-
 /* Computational core for FINUFFT.
 
    Based on Barnett 2017-2018 finufft?d.cpp containing nine drivers, plus
@@ -86,18 +85,16 @@ Design notes for guru interface implementation:
   state apart from that associated with FFTW (and the did_fftw_init).
 */
 
-
-
 // ---------- local math routines (were in common.cpp; no need now): --------
 
 namespace finufft {
-  namespace common {
+namespace common {
 
-  // Technically global state...
-  // Needs to be static to avoid name collision with SINGLE/DOUBLE
-  static std::mutex fftw_lock;
+// Technically global state...
+// Needs to be static to avoid name collision with SINGLE/DOUBLE
+static std::mutex fftw_lock;
 
-  // We macro because it has no FLT args but gets compiled for both prec's...
+// We macro because it has no FLT args but gets compiled for both prec's...
 #ifdef SINGLE
 #define SET_NF_TYPE12 set_nf_type12f
 #else
@@ -108,18 +105,22 @@ int SET_NF_TYPE12(BIGINT ms, finufft_opts opts, finufft_spread_opts spopts, BIGI
 // and requested number of Fourier modes ms. Returns 0 if success, else an
 // error code if nf was unreasonably big (& tell the world).
 {
-  *nf = (BIGINT)(opts.upsampfac*ms);       // manner of rounding not crucial
-  if (*nf<2*spopts.nspread) *nf=2*spopts.nspread; // otherwise spread fails
-  if (*nf<MAX_NF) {
-    *nf = next235even(*nf);                       // expensive at huge nf
+  *nf = (BIGINT)(opts.upsampfac * ms); // manner of rounding not crucial
+  if (*nf < 2 * spopts.nspread) *nf = 2 * spopts.nspread; // otherwise spread fails
+  if (*nf < MAX_NF) {
+    *nf = next235even(*nf);                               // expensive at huge nf
     return 0;
   } else {
-    fprintf(stderr,"[%s] nf=%.3g exceeds MAX_NF of %.3g, so exit without attempting even a malloc\n",__func__,(double)*nf,(double)MAX_NF);
+    fprintf(
+        stderr,
+        "[%s] nf=%.3g exceeds MAX_NF of %.3g, so exit without attempting even a malloc\n",
+        __func__, (double)*nf, (double)MAX_NF);
     return FINUFFT_ERR_MAXNALLOC;
   }
 }
 
-int setup_spreader_for_nufft(finufft_spread_opts &spopts, FLT eps, finufft_opts opts, int dim)
+int setup_spreader_for_nufft(finufft_spread_opts &spopts, FLT eps, finufft_opts opts,
+                             int dim)
 // Set up the spreader parameters given eps, and pass across various nufft
 // options. Return status of setup_spreader. Uses pass-by-ref. Barnett 10/30/17
 {
@@ -127,22 +128,24 @@ int setup_spreader_for_nufft(finufft_spread_opts &spopts, FLT eps, finufft_opts
   int ier = setup_spreader(spopts, eps, opts.upsampfac, opts.spread_kerevalmeth,
                            opts.spread_debug, opts.showwarn, dim);
   // override various spread opts from their defaults...
-  spopts.debug = opts.spread_debug;
-  spopts.sort = opts.spread_sort;     // could make dim or CPU choices here?
-  spopts.kerpad = opts.spread_kerpad; // (only applies to kerevalmeth=0)
-  spopts.chkbnds = opts.chkbnds;
-  spopts.nthreads = opts.nthreads;    // 0 passed in becomes omp max by here
-  if (opts.spread_nthr_atomic>=0)     // overrides
+  spopts.debug    = opts.spread_debug;
+  spopts.sort     = opts.spread_sort;   // could make dim or CPU choices here?
+  spopts.kerpad   = opts.spread_kerpad; // (only applies to kerevalmeth=0)
+  spopts.chkbnds  = opts.chkbnds;
+  spopts.nthreads = opts.nthreads;      // 0 passed in becomes omp max by here
+  if (opts.spread_nthr_atomic >= 0)     // overrides
     spopts.atomic_threshold = opts.spread_nthr_atomic;
-  if (opts.spread_max_sp_size>0)      // overrides
+  if (opts.spread_max_sp_size > 0)      // overrides
     spopts.max_subproblem_size = opts.spread_max_sp_size;
-  if (opts.chkbnds != 1)              // deprecated default value hardcoded here
-    fprintf(stderr, "[%s] opts.chkbnds is deprecated; ignoring change from default value.\n",__func__);
+  if (opts.chkbnds != 1)                // deprecated default value hardcoded here
+    fprintf(stderr,
+            "[%s] opts.chkbnds is deprecated; ignoring change from default value.\n",
+            __func__);
   return ier;
-} 
+}
 
 void set_nhg_type3(FLT S, FLT X, finufft_opts opts, finufft_spread_opts spopts,
-		     BIGINT *nf, FLT *h, FLT *gam)
+                   BIGINT *nf, FLT *h, FLT *gam)
 /* sets nf, h (upsampled grid spacing), and gamma (x_j rescaling factor),
    for type 3 only.
    Inputs:
@@ -156,26 +159,27 @@ void set_nhg_type3(FLT S, FLT X, finufft_opts opts, finufft_spread_opts spopts,
    New logic 6/12/17
 */
 {
-  int nss = spopts.nspread + 1;      // since ns may be odd
-  FLT Xsafe=X, Ssafe=S;              // may be tweaked locally
-  if (X==0.0)                        // logic ensures XS>=1, handle X=0 a/o S=0
-    if (S==0.0) {
-      Xsafe=1.0;
-      Ssafe=1.0;
-    } else Xsafe = max(Xsafe, 1/S);
+  int nss   = spopts.nspread + 1; // since ns may be odd
+  FLT Xsafe = X, Ssafe = S;       // may be tweaked locally
+  if (X == 0.0)                   // logic ensures XS>=1, handle X=0 a/o S=0
+    if (S == 0.0) {
+      Xsafe = 1.0;
+      Ssafe = 1.0;
+    } else
+      Xsafe = max(Xsafe, 1 / S);
   else
-    Ssafe = max(Ssafe, 1/X);
+    Ssafe = max(Ssafe, 1 / X);
   // use the safe X and S...
-  FLT nfd = 2.0*opts.upsampfac*Ssafe*Xsafe/PI + nss;
-  if (!isfinite(nfd)) nfd=0.0;                // use FLT to catch inf
+  FLT nfd = 2.0 * opts.upsampfac * Ssafe * Xsafe / PI + nss;
+  if (!isfinite(nfd)) nfd = 0.0; // use FLT to catch inf
   *nf = (BIGINT)nfd;
-  //printf("initial nf=%lld, ns=%d\n",*nf,spopts.nspread);
-  // catch too small nf, and nan or +-inf, otherwise spread fails...
-  if (*nf<2*spopts.nspread) *nf=2*spopts.nspread;
-  if (*nf<MAX_NF)                             // otherwise will fail anyway
-    *nf = next235even(*nf);                   // expensive at huge nf
-  *h = 2*PI / *nf;                            // upsampled grid spacing
-  *gam = (FLT)*nf / (2.0*opts.upsampfac*Ssafe);  // x scale fac to x'
+  // printf("initial nf=%lld, ns=%d\n",*nf,spopts.nspread);
+  // catch too small nf, and nan or +-inf, otherwise spread fails...
+  if (*nf < 2 * spopts.nspread) *nf = 2 * spopts.nspread;
+  if (*nf < MAX_NF)                                 // otherwise will fail anyway
+    *nf = next235even(*nf);                         // expensive at huge nf
+  *h   = 2 * PI / *nf;                              // upsampled grid spacing
+  *gam = (FLT)*nf / (2.0 * opts.upsampfac * Ssafe); // x scale fac to x'
 }
 
 void onedim_fseries_kernel(BIGINT nf, FLT *fwkerhalf, finufft_spread_opts opts)
@@ -204,34 +208,34 @@ void onedim_fseries_kernel(BIGINT nf, FLT *fwkerhalf, finufft_spread_opts opts)
   Fixed num_threads 7/20/20
  */
 {
-  FLT J2 = opts.nspread/2.0;            // J/2, half-width of ker z-support
+  FLT J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support
   // # quadr nodes in z (from 0 to J/2; reflections will be added)...
-  int q=(int)(2 + 3.0*J2);  // not sure why so large? cannot exceed MAX_NQUAD
+  int q = (int)(2 + 3.0 * J2); // not sure why so large? cannot exceed MAX_NQUAD
   FLT f[MAX_NQUAD];
-  double z[2*MAX_NQUAD], w[2*MAX_NQUAD];
-  legendre_compute_glr(2*q,z,w);        // only half the nodes used, eg on (0,1)
+  double z[2 * MAX_NQUAD], w[2 * MAX_NQUAD];
+  legendre_compute_glr(2 * q, z, w); // only half the nodes used, eg on (0,1)
   std::complex<FLT> a[MAX_NQUAD];
-  for (int n=0;n<q;++n) {               // set up nodes z_n and vals f_n
-    z[n] *= J2;                         // rescale nodes
-    f[n] = J2*(FLT)w[n] * evaluate_kernel((FLT)z[n], opts); // vals & quadr wei
-    a[n] = exp(2*PI*IMA*(FLT)(nf/2-z[n])/(FLT)nf);  // phase winding rates
+  for (int n = 0; n < q; ++n) {      // set up nodes z_n and vals f_n
+    z[n] *= J2;                      // rescale nodes
+    f[n] = J2 * (FLT)w[n] * evaluate_kernel((FLT)z[n], opts);  // vals & quadr wei
+    a[n] = exp(2 * PI * IMA * (FLT)(nf / 2 - z[n]) / (FLT)nf); // phase winding rates
   }
-  BIGINT nout=nf/2+1;                   // how many values we're writing to
-  int nt = min(nout,(BIGINT)opts.nthreads);         // how many chunks
-  std::vector<BIGINT> brk(nt+1);        // start indices for each thread
-  for (int t=0; t<=nt; ++t)             // split nout mode indices btw threads
-    brk[t] = (BIGINT)(0.5 + nout*t/(double)nt);
+  BIGINT nout = nf / 2 + 1;                       // how many values we're writing to
+  int nt      = min(nout, (BIGINT)opts.nthreads); // how many chunks
+  std::vector<BIGINT> brk(nt + 1);                // start indices for each thread
+  for (int t = 0; t <= nt; ++t)                   // split nout mode indices btw threads
+    brk[t] = (BIGINT)(0.5 + nout * t / (double)nt);
 #pragma omp parallel num_threads(nt)
-  {                                     // each thread gets own chunk to do
+  {                                                // each thread gets own chunk to do
     int t = MY_OMP_GET_THREAD_NUM();
-    std::complex<FLT> aj[MAX_NQUAD];    // phase rotator for this thread
-    for (int n=0;n<q;++n)
-      aj[n] = pow(a[n],(FLT)brk[t]);    // init phase factors for chunk
-    for (BIGINT j=brk[t];j<brk[t+1];++j) {          // loop along output array
-      FLT x = 0.0;                      // accumulator for answer at this j
-      for (int n=0;n<q;++n) {
-        x += f[n] * 2*real(aj[n]);      // include the negative freq
-        aj[n] *= a[n];                  // wind the phases
+    std::complex<FLT> aj[MAX_NQUAD];               // phase rotator for this thread
+    for (int n = 0; n < q; ++n)
+      aj[n] = pow(a[n], (FLT)brk[t]);              // init phase factors for chunk
+    for (BIGINT j = brk[t]; j < brk[t + 1]; ++j) { // loop along output array
+      FLT x = 0.0;                                 // accumulator for answer at this j
+      for (int n = 0; n < q; ++n) {
+        x += f[n] * 2 * real(aj[n]);               // include the negative freq
+        aj[n] *= a[n];                             // wind the phases
       }
       fwkerhalf[j] = x;
     }
@@ -259,28 +263,29 @@ void onedim_nuft_kernel(BIGINT nk, FLT *k, FLT *phihat, finufft_spread_opts opts
   Barnett 2/8/17. openmp since cos slow 2/9/17
  */
 {
-  FLT J2 = opts.nspread/2.0;        // J/2, half-width of ker z-support
+  FLT J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support
   // # quadr nodes in z (from 0 to J/2; reflections will be added)...
-  int q=(int)(2 + 2.0*J2);     // > pi/2 ratio.  cannot exceed MAX_NQUAD
-  if (opts.debug) printf("q (# ker FT quadr pts) = %d\n",q);
-  FLT f[MAX_NQUAD]; double z[2*MAX_NQUAD],w[2*MAX_NQUAD];   // glr needs double
-  legendre_compute_glr(2*q,z,w);        // only half the nodes used, eg on (0,1)
-  for (int n=0;n<q;++n) {
-    z[n] *= (FLT)J2;                    // quadr nodes for [0,J/2]
-    f[n] = J2*(FLT)w[n] * evaluate_kernel((FLT)z[n], opts);  // w/ quadr weights
-    //printf("f[%d] = %.3g\n",n,f[n]);
+  int q = (int)(2 + 2.0 * J2); // > pi/2 ratio.  cannot exceed MAX_NQUAD
+  if (opts.debug) printf("q (# ker FT quadr pts) = %d\n", q);
+  FLT f[MAX_NQUAD];
+  double z[2 * MAX_NQUAD], w[2 * MAX_NQUAD]; // glr needs double
+  legendre_compute_glr(2 * q, z, w);         // only half the nodes used, eg on (0,1)
+  for (int n = 0; n < q; ++n) {
+    z[n] *= (FLT)J2;                         // quadr nodes for [0,J/2]
+    f[n] = J2 * (FLT)w[n] * evaluate_kernel((FLT)z[n], opts); // w/ quadr weights
+    // printf("f[%d] = %.3g\n",n,f[n]);
   }
 #pragma omp parallel for num_threads(opts.nthreads)
-  for (BIGINT j=0;j<nk;++j) {          // loop along output array
-    FLT x = 0.0;                       // register
-    for (int n=0;n<q;++n)
-      x += f[n] * 2*cos(k[j]*(FLT)z[n]);  // pos & neg freq pair.  use FLT cos!
+  for (BIGINT j = 0; j < nk; ++j) {          // loop along output array
+    FLT x = 0.0;                             // register
+    for (int n = 0; n < q; ++n)
+      x += f[n] * 2 * cos(k[j] * (FLT)z[n]); // pos & neg freq pair.  use FLT cos!
     phihat[j] = x;
   }
-}  
+}
 
-void deconvolveshuffle1d(int dir,FLT prefac,FLT* ker, BIGINT ms,
-			 FLT *fk, BIGINT nf1, FFTW_CPX* fw, int modeord)
+void deconvolveshuffle1d(int dir, FLT prefac, FLT *ker, BIGINT ms, FLT *fk, BIGINT nf1,
+                         FFTW_CPX *fw, int modeord)
 /*
   if dir==1: copies fw to fk with amplification by prefac/ker
   if dir==2: copies fk to fw (and zero pads rest of it), same amplification.
@@ -305,38 +310,40 @@ void deconvolveshuffle1d(int dir,FLT prefac,FLT* ker, BIGINT ms,
   Barnett 1/25/17. Fixed ms=0 case 3/14/17. modeord flag & clean 10/25/17
 */
 {
-  BIGINT kmin = -ms/2, kmax = (ms-1)/2;    // inclusive range of k indices
-  if (ms==0) kmax=-1;           // fixes zero-pad for trivial no-mode case
+  BIGINT kmin = -ms / 2, kmax = (ms - 1) / 2; // inclusive range of k indices
+  if (ms == 0) kmax = -1;                     // fixes zero-pad for trivial no-mode case
   // set up pp & pn as ptrs to start of pos(ie nonneg) & neg chunks of fk array
-  BIGINT pp = -2*kmin, pn = 0;       // CMCL mode-ordering case (2* since cmplx)
-  if (modeord==1) { pp = 0; pn = 2*(kmax+1); }   // or, instead, FFT ordering
-  if (dir==1) {    // read fw, write out to fk...
-    for (BIGINT k=0;k<=kmax;++k) {                    // non-neg freqs k
-      fk[pp++] = prefac * fw[k][0] / ker[k];          // re
-      fk[pp++] = prefac * fw[k][1] / ker[k];          // im
+  BIGINT pp = -2 * kmin, pn = 0; // CMCL mode-ordering case (2* since cmplx)
+  if (modeord == 1) {
+    pp = 0;
+    pn = 2 * (kmax + 1);
+  } // modeord==1: FFT ordering instead of CMCL
+  if (dir == 1) {                                   // read fw, write out to fk...
+    for (BIGINT k = 0; k <= kmax; ++k) {            // non-neg freqs k
+      fk[pp++] = prefac * fw[k][0] / ker[k];        // re
+      fk[pp++] = prefac * fw[k][1] / ker[k];        // im
+    }
+    for (BIGINT k = kmin; k < 0; ++k) {             // neg freqs k
+      fk[pn++] = prefac * fw[nf1 + k][0] / ker[-k]; // re
+      fk[pn++] = prefac * fw[nf1 + k][1] / ker[-k]; // im
     }
-    for (BIGINT k=kmin;k<0;++k) {                     // neg freqs k
-      fk[pn++] = prefac * fw[nf1+k][0] / ker[-k];     // re
-      fk[pn++] = prefac * fw[nf1+k][1] / ker[-k];     // im
+  } else { // read fk, write out to fw w/ zero padding...
+    for (BIGINT k = kmax + 1; k < nf1 + kmin; ++k) { // zero pad precisely where needed
+      fw[k][0] = fw[k][1] = 0.0;
     }
-  } else {    // read fk, write out to fw w/ zero padding...
-    for (BIGINT k=kmax+1; k<nf1+kmin; ++k) {  // zero pad precisely where needed
-      fw[k][0] = fw[k][1] = 0.0; }
-    for (BIGINT k=0;k<=kmax;++k) {                    // non-neg freqs k
-      fw[k][0] = prefac * fk[pp++] / ker[k];          // re
-      fw[k][1] = prefac * fk[pp++] / ker[k];          // im
+    for (BIGINT k = 0; k <= kmax; ++k) {            // non-neg freqs k
+      fw[k][0] = prefac * fk[pp++] / ker[k];        // re
+      fw[k][1] = prefac * fk[pp++] / ker[k];        // im
     }
-    for (BIGINT k=kmin;k<0;++k) {                     // neg freqs k
-      fw[nf1+k][0] = prefac * fk[pn++] / ker[-k];     // re
-      fw[nf1+k][1] = prefac * fk[pn++] / ker[-k];     // im
+    for (BIGINT k = kmin; k < 0; ++k) {             // neg freqs k
+      fw[nf1 + k][0] = prefac * fk[pn++] / ker[-k]; // re
+      fw[nf1 + k][1] = prefac * fk[pn++] / ker[-k]; // im
     }
   }
 }
 
-void deconvolveshuffle2d(int dir,FLT prefac,FLT *ker1, FLT *ker2,
-			 BIGINT ms, BIGINT mt,
-			 FLT *fk, BIGINT nf1, BIGINT nf2, FFTW_CPX* fw,
-			 int modeord)
+void deconvolveshuffle2d(int dir, FLT prefac, FLT *ker1, FLT *ker2, BIGINT ms, BIGINT mt,
+                         FLT *fk, BIGINT nf1, BIGINT nf2, FFTW_CPX *fw, int modeord)
 /*
   2D version of deconvolveshuffle1d, calls it on each x-line using 1/ker2 fac.
 
@@ -356,25 +363,29 @@ void deconvolveshuffle2d(int dir,FLT prefac,FLT *ker1, FLT *ker2,
   Barnett 2/1/17, Fixed mt=0 case 3/14/17. modeord 10/25/17
 */
 {
-  BIGINT k2min = -mt/2, k2max = (mt-1)/2;    // inclusive range of k2 indices
-  if (mt==0) k2max=-1;           // fixes zero-pad for trivial no-mode case
+  BIGINT k2min = -mt / 2, k2max = (mt - 1) / 2; // inclusive range of k2 indices
+  if (mt == 0) k2max = -1;                      // fixes zero-pad for trivial no-mode case
   // set up pp & pn as ptrs to start of pos(ie nonneg) & neg chunks of fk array
-  BIGINT pp = -2*k2min*ms, pn = 0;   // CMCL mode-ordering case (2* since cmplx)
-  if (modeord==1) { pp = 0; pn = 2*(k2max+1)*ms; }  // or, instead, FFT ordering
-  if (dir==2)               // zero pad needed x-lines (contiguous in memory)
-    for (BIGINT j=nf1*(k2max+1); j<nf1*(nf2+k2min); ++j)  // sweeps all dims
+  BIGINT pp = -2 * k2min * ms, pn = 0; // CMCL mode-ordering case (2* since cmplx)
+  if (modeord == 1) {
+    pp = 0;
+    pn = 2 * (k2max + 1) * ms;
+  } // modeord==1: FFT ordering instead of CMCL
+  if (dir == 2) // zero pad needed x-lines (contiguous in memory)
+    for (BIGINT j = nf1 * (k2max + 1); j < nf1 * (nf2 + k2min); ++j) // sweeps all dims
       fw[j][0] = fw[j][1] = 0.0;
-  for (BIGINT k2=0;k2<=k2max;++k2, pp+=2*ms)          // non-neg y-freqs
+  for (BIGINT k2 = 0; k2 <= k2max; ++k2, pp += 2 * ms)               // non-neg y-freqs
     // point fk and fw to the start of this y value's row (2* is for complex):
-    common::deconvolveshuffle1d(dir,prefac/ker2[k2],ker1,ms,fk + pp,nf1,&fw[nf1*k2],modeord);
-  for (BIGINT k2=k2min;k2<0;++k2, pn+=2*ms)           // neg y-freqs
-    common::deconvolveshuffle1d(dir,prefac/ker2[-k2],ker1,ms,fk + pn,nf1,&fw[nf1*(nf2+k2)],modeord);
+    common::deconvolveshuffle1d(dir, prefac / ker2[k2], ker1, ms, fk + pp, nf1,
+                                &fw[nf1 * k2], modeord);
+  for (BIGINT k2 = k2min; k2 < 0; ++k2, pn += 2 * ms) // neg y-freqs
+    common::deconvolveshuffle1d(dir, prefac / ker2[-k2], ker1, ms, fk + pn, nf1,
+                                &fw[nf1 * (nf2 + k2)], modeord);
 }
 
-void deconvolveshuffle3d(int dir,FLT prefac,FLT *ker1, FLT *ker2,
-			 FLT *ker3, BIGINT ms, BIGINT mt, BIGINT mu,
-			 FLT *fk, BIGINT nf1, BIGINT nf2, BIGINT nf3,
-			 FFTW_CPX* fw, int modeord)
+void deconvolveshuffle3d(int dir, FLT prefac, FLT *ker1, FLT *ker2, FLT *ker3, BIGINT ms,
+                         BIGINT mt, BIGINT mu, FLT *fk, BIGINT nf1, BIGINT nf2,
+                         BIGINT nf3, FFTW_CPX *fw, int modeord)
 /*
   3D version of deconvolveshuffle2d, calls it on each xy-plane using 1/ker3 fac.
 
@@ -394,28 +405,30 @@ void deconvolveshuffle3d(int dir,FLT prefac,FLT *ker1, FLT *ker2,
   Barnett 2/1/17, Fixed mu=0 case 3/14/17. modeord 10/25/17
 */
 {
-  BIGINT k3min = -mu/2, k3max = (mu-1)/2;    // inclusive range of k3 indices
-  if (mu==0) k3max=-1;           // fixes zero-pad for trivial no-mode case
+  BIGINT k3min = -mu / 2, k3max = (mu - 1) / 2; // inclusive range of k3 indices
+  if (mu == 0) k3max = -1;                      // fixes zero-pad for trivial no-mode case
   // set up pp & pn as ptrs to start of pos(ie nonneg) & neg chunks of fk array
-  BIGINT pp = -2*k3min*ms*mt, pn = 0; // CMCL mode-ordering (2* since cmplx)
-  if (modeord==1) { pp = 0; pn = 2*(k3max+1)*ms*mt; }  // or FFT ordering
-  BIGINT np = nf1*nf2;  // # pts in an upsampled Fourier xy-plane
-  if (dir==2)           // zero pad needed xy-planes (contiguous in memory)
-    for (BIGINT j=np*(k3max+1);j<np*(nf3+k3min);++j)  // sweeps all dims
+  BIGINT pp = -2 * k3min * ms * mt, pn = 0; // CMCL mode-ordering (2* since cmplx)
+  if (modeord == 1) {
+    pp = 0;
+    pn = 2 * (k3max + 1) * ms * mt;
+  } // modeord==1: FFT ordering instead of CMCL
+  BIGINT np = nf1 * nf2; // # pts in an upsampled Fourier xy-plane
+  if (dir == 2)          // zero pad needed xy-planes (contiguous in memory)
+    for (BIGINT j = np * (k3max + 1); j < np * (nf3 + k3min); ++j) // sweeps all dims
       fw[j][0] = fw[j][1] = 0.0;
-  for (BIGINT k3=0;k3<=k3max;++k3, pp+=2*ms*mt)      // non-neg z-freqs
+  for (BIGINT k3 = 0; k3 <= k3max; ++k3, pp += 2 * ms * mt)        // non-neg z-freqs
     // point fk and fw to the start of this z value's plane (2* is for complex):
-    common::deconvolveshuffle2d(dir,prefac/ker3[k3],ker1,ker2,ms,mt,
-			fk + pp,nf1,nf2,&fw[np*k3],modeord);
-  for (BIGINT k3=k3min;k3<0;++k3, pn+=2*ms*mt)       // neg z-freqs
-    common::deconvolveshuffle2d(dir,prefac/ker3[-k3],ker1,ker2,ms,mt,
-			fk + pn,nf1,nf2,&fw[np*(nf3+k3)],modeord);
+    common::deconvolveshuffle2d(dir, prefac / ker3[k3], ker1, ker2, ms, mt, fk + pp, nf1,
+                                nf2, &fw[np * k3], modeord);
+  for (BIGINT k3 = k3min; k3 < 0; ++k3, pn += 2 * ms * mt) // neg z-freqs
+    common::deconvolveshuffle2d(dir, prefac / ker3[-k3], ker1, ker2, ms, mt, fk + pn, nf1,
+                                nf2, &fw[np * (nf3 + k3)], modeord);
 }
 
-
 // --------- batch helper functions for t1,2 exec: ---------------------------
 
-int spreadinterpSortedBatch(int batchSize, FINUFFT_PLAN p, CPX* cBatch)
+int spreadinterpSortedBatch(int batchSize, FINUFFT_PLAN p, CPX *cBatch)
 /*
   Spreads (or interpolates) a batch of batchSize strength vectors in cBatch
   to (or from) the batch of fine working grids p->fwBatch, using the same set of
@@ -433,19 +446,19 @@ int spreadinterpSortedBatch(int batchSize, FINUFFT_PLAN p, CPX* cBatch)
   // omp_sets_nested deprecated, so don't use; assume not nested for 2 to work.
   // But when nthr_outer=1 here, omp par inside the loop sees all threads...
 #ifdef _OPENMP
-  int nthr_outer = p->opts.spread_thread==1 ? 1 : batchSize;
+  int nthr_outer = p->opts.spread_thread == 1 ? 1 : batchSize;
 #endif
 #pragma omp parallel for num_threads(nthr_outer)
-  for (int i=0; i<batchSize; i++) {
-    FFTW_CPX *fwi = p->fwBatch + i*p->nf;  // start of i'th fw array in wkspace
-    CPX *ci = cBatch + i*p->nj;            // start of i'th c array in cBatch
-    spreadinterpSorted(p->sortIndices, p->nf1, p->nf2, p->nf3, (FLT*)fwi, p->nj,
-                       p->X, p->Y, p->Z, (FLT*)ci, p->spopts, p->didSort);
+  for (int i = 0; i < batchSize; i++) {
+    FFTW_CPX *fwi = p->fwBatch + i * p->nf; // start of i'th fw array in wkspace
+    CPX *ci       = cBatch + i * p->nj;     // start of i'th c array in cBatch
+    spreadinterpSorted(p->sortIndices, p->nf1, p->nf2, p->nf3, (FLT *)fwi, p->nj, p->X,
+                       p->Y, p->Z, (FLT *)ci, p->spopts, p->didSort);
   }
   return 0;
 }
 
-int deconvolveBatch(int batchSize, FINUFFT_PLAN p, CPX* fkBatch)
+int deconvolveBatch(int batchSize, FINUFFT_PLAN p, CPX *fkBatch)
 /*
   Type 1: deconvolves (amplifies) from each interior fw array in p->fwBatch
   into each output array fk in fkBatch.
@@ -459,29 +472,25 @@ int deconvolveBatch(int batchSize, FINUFFT_PLAN p, CPX* fkBatch)
 {
   // since deconvolveshuffle?d are single-thread, omp par seems to help here...
 #pragma omp parallel for num_threads(batchSize)
-  for (int i=0; i<batchSize; i++) {
-    FFTW_CPX *fwi = p->fwBatch + i*p->nf;  // start of i'th fw array in wkspace
-    CPX *fki = fkBatch + i*p->N;           // start of i'th fk array in fkBatch
-    
+  for (int i = 0; i < batchSize; i++) {
+    FFTW_CPX *fwi = p->fwBatch + i * p->nf; // start of i'th fw array in wkspace
+    CPX *fki      = fkBatch + i * p->N;     // start of i'th fk array in fkBatch
+
     // Call routine from common.cpp for the dim; prefactors hardcoded to 1.0...
     if (p->dim == 1)
-      deconvolveshuffle1d(p->spopts.spread_direction, 1.0, p->phiHat1,
-                          p->ms, (FLT *)fki,
+      deconvolveshuffle1d(p->spopts.spread_direction, 1.0, p->phiHat1, p->ms, (FLT *)fki,
                           p->nf1, fwi, p->opts.modeord);
     else if (p->dim == 2)
-      deconvolveshuffle2d(p->spopts.spread_direction,1.0, p->phiHat1,
-                          p->phiHat2, p->ms, p->mt, (FLT *)fki,
-                          p->nf1, p->nf2, fwi, p->opts.modeord);
+      deconvolveshuffle2d(p->spopts.spread_direction, 1.0, p->phiHat1, p->phiHat2, p->ms,
+                          p->mt, (FLT *)fki, p->nf1, p->nf2, fwi, p->opts.modeord);
     else
-      deconvolveshuffle3d(p->spopts.spread_direction, 1.0, p->phiHat1,
-                          p->phiHat2, p->phiHat3, p->ms, p->mt, p->mu,
-                          (FLT *)fki, p->nf1, p->nf2, p->nf3,
-                          fwi, p->opts.modeord);
+      deconvolveshuffle3d(p->spopts.spread_direction, 1.0, p->phiHat1, p->phiHat2,
+                          p->phiHat3, p->ms, p->mt, p->mu, (FLT *)fki, p->nf1, p->nf2,
+                          p->nf3, fwi, p->opts.modeord);
   }
   return 0;
 }
 
-
 // since this func is local only, we macro its name here...
 #ifdef SINGLE
 #define GRIDSIZE_FOR_FFTW gridsize_for_fftwf
@@ -489,21 +498,20 @@ int deconvolveBatch(int batchSize, FINUFFT_PLAN p, CPX* fkBatch)
 #define GRIDSIZE_FOR_FFTW gridsize_for_fftw
 #endif
 
-int* GRIDSIZE_FOR_FFTW(FINUFFT_PLAN p){
-// local helper func returns a new int array of length dim, extracted from
-// the finufft plan, that fftw_plan_many_dft needs as its 2nd argument.
-  int* nf;
-  if(p->dim == 1){ 
-    nf = new int[1];
+int *GRIDSIZE_FOR_FFTW(FINUFFT_PLAN p) {
+  // local helper func returns a new int array of length dim, extracted from
+  // the finufft plan, that fftw_plan_many_dft needs as its 2nd argument.
+  int *nf;
+  if (p->dim == 1) {
+    nf    = new int[1];
     nf[0] = (int)p->nf1;
-  }
-  else if (p->dim == 2){ 
-    nf = new int[2];
+  } else if (p->dim == 2) {
+    nf    = new int[2];
     nf[0] = (int)p->nf2;
-    nf[1] = (int)p->nf1; 
-  }   // fftw enforced row major ordering, ie dims are backwards ordered
-  else{ 
-    nf = new int[3];
+    nf[1] = (int)p->nf1;
+  } // fftw enforced row major ordering, ie dims are backwards ordered
+  else {
+    nf    = new int[3];
     nf[0] = (int)p->nf3;
     nf[1] = (int)p->nf2;
     nf[2] = (int)p->nf1;
@@ -511,17 +519,12 @@ int* GRIDSIZE_FOR_FFTW(FINUFFT_PLAN p){
   return nf;
 }
 
-
-  }   // namespace
-}   // namespace
-
-
-
+} // namespace common
+} // namespace finufft
 
 // --------------- rest is the 5 user guru (plan) interface drivers: ---------
 // (not namespaced since have safe names finufft{f}_* )
-using namespace finufft::common;  // accesses routines defined above
-
+using namespace finufft::common; // accesses routines defined above
 
 // Marco Barbone: 5.8.2024
 // These are user-facing.
@@ -540,26 +543,26 @@ void FINUFFT_DEFAULT_OPTS(finufft_opts *o)
   o->modeord = 0;
   o->chkbnds = 1;
 
-  o->debug = 0;
+  o->debug        = 0;
   o->spread_debug = 0;
-  o->showwarn = 1;
+  o->showwarn     = 1;
 
-  o->nthreads = 0;
-  o->fftw = FFTW_ESTIMATE; //
-  o->spread_sort = 2;
+  o->nthreads           = 0;
+  o->fftw               = FFTW_ESTIMATE; //
+  o->spread_sort        = 2;
   o->spread_kerevalmeth = 1;
-  o->spread_kerpad = 1;
-  o->upsampfac = 0.0;
-  o->spread_thread = 0;
-  o->maxbatchsize = 0;
+  o->spread_kerpad      = 1;
+  o->upsampfac          = 0.0;
+  o->spread_thread      = 0;
+  o->maxbatchsize       = 0;
   o->spread_nthr_atomic = -1;
   o->spread_max_sp_size = 0;
   // sphinx tag (don't remove): @defopts_end
 }
 
 // PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP
-int FINUFFT_MAKEPLAN(int type, int dim, BIGINT* n_modes, int iflag,
-                     int ntrans, FLT tol, FINUFFT_PLAN *pp, finufft_opts* opts)
+int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans, FLT tol,
+                     FINUFFT_PLAN *pp, finufft_opts *opts)
 // Populates the fields of finufft_plan which is pointed to by "pp".
 // opts is ptr to a finufft_opts to set options, or NULL to use defaults.
 // For some of the fields (if "auto" selected) here choose the actual setting.
@@ -567,596 +570,649 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT* n_modes, int iflag,
 // evaluates spreading kernel coefficients, and instantiates the fftw_plan
 {
   FINUFFT_PLAN p;
-  p = new FINUFFT_PLAN_S;                // allocate fresh plan struct
-  *pp = p;                               // pass out plan as ptr to plan struct
+  p   = new FINUFFT_PLAN_S; // allocate fresh plan struct
+  *pp = p;                  // pass out plan as ptr to plan struct
 
-  if (opts==NULL)                        // use default opts
+  if (opts == NULL)         // use default opts
     FINUFFT_DEFAULT_OPTS(&(p->opts));
-  else                                   // or read from what's passed in
-    p->opts = *opts;    // keep a deep copy; changing *opts now has no effect
-
-  if (p->opts.debug)    // do a hello world
-    printf("[%s] new plan: FINUFFT version " FINUFFT_VER " .................\n",__func__);
-  
-  if((type!=1)&&(type!=2)&&(type!=3)) {
-    fprintf(stderr, "[%s] Invalid type (%d), should be 1, 2 or 3.\n",__func__,type);
+  else                      // or read from what's passed in
+    p->opts = *opts;        // keep a deep copy; changing *opts now has no effect
+
+  if (p->opts.debug)        // do a hello world
+    printf("[%s] new plan: FINUFFT version " FINUFFT_VER " .................\n",
+           __func__);
+
+  if ((type != 1) && (type != 2) && (type != 3)) {
+    fprintf(stderr, "[%s] Invalid type (%d), should be 1, 2 or 3.\n", __func__, type);
     return FINUFFT_ERR_TYPE_NOTVALID;
   }
-  if((dim!=1)&&(dim!=2)&&(dim!=3)) {
-    fprintf(stderr, "[%s] Invalid dim (%d), should be 1, 2 or 3.\n",__func__,dim);
+  if ((dim != 1) && (dim != 2) && (dim != 3)) {
+    fprintf(stderr, "[%s] Invalid dim (%d), should be 1, 2 or 3.\n", __func__, dim);
     return FINUFFT_ERR_DIM_NOTVALID;
   }
-  if (ntrans<1) {
-    fprintf(stderr,"[%s] ntrans (%d) should be at least 1.\n",__func__,ntrans);
+  if (ntrans < 1) {
+    fprintf(stderr, "[%s] ntrans (%d) should be at least 1.\n", __func__, ntrans);
     return FINUFFT_ERR_NTRANS_NOTVALID;
   }
-  
+
   // get stuff from args...
-  p->type = type;
-  p->dim = dim;
-  p->ntrans = ntrans;
-  p->tol = tol;
-  p->fftSign = (iflag>=0) ? 1 : -1;         // clean up flag input
+  p->type    = type;
+  p->dim     = dim;
+  p->ntrans  = ntrans;
+  p->tol     = tol;
+  p->fftSign = (iflag >= 0) ? 1 : -1; // clean up flag input
 
   // choose overall # threads...
 #ifdef _OPENMP
   int ompmaxnthr = MY_OMP_GET_MAX_THREADS();
-  int nthr = ompmaxnthr;                    // default: use as many as OMP gives us
+  int nthr       = ompmaxnthr; // default: use as many as OMP gives us
   // (the above could be set, or suggested set, to 1 for small enough problems...)
-  if (p->opts.nthreads>0) {
-    nthr = p->opts.nthreads;                // user override, now without limit
+  if (p->opts.nthreads > 0) {
+    nthr = p->opts.nthreads; // user override, now without limit
     if (p->opts.showwarn && (nthr > ompmaxnthr))
-      fprintf(stderr,"%s warning: using opts.nthreads=%d, more than the %d OpenMP claims available; note large nthreads can be slower.\n",__func__,nthr,ompmaxnthr);
+      fprintf(stderr,
+              "%s warning: using opts.nthreads=%d, more than the %d OpenMP claims "
+              "available; note large nthreads can be slower.\n",
+              __func__, nthr, ompmaxnthr);
   }
 #else
-  int nthr = 1;                             // always 1 thread (avoid segfault)
-  if (p->opts.nthreads>1)
-    fprintf(stderr,"%s warning: opts.nthreads=%d but library is single-threaded; ignoring!\n",__func__,p->opts.nthreads);
+  int nthr = 1; // always 1 thread (avoid segfault)
+  if (p->opts.nthreads > 1)
+    fprintf(stderr,
+            "%s warning: opts.nthreads=%d but library is single-threaded; ignoring!\n",
+            __func__, p->opts.nthreads);
 #endif
-  p->opts.nthreads = nthr;                  // store actual # thr planned for
+  p->opts.nthreads = nthr; // store actual # thr planned for
   // (this sets/limits all downstream spread/interp, 1dkernel, and FFT thread counts...)
-  
+
   // choose batchSize for types 1,2 or 3... (uses int ceil(b/a)=1+(b-1)/a trick)
-  if (p->opts.maxbatchsize==0) {            // logic to auto-set best batchsize
-    p->nbatch = 1+(ntrans-1)/nthr;          // min # batches poss
-    p->batchSize = 1+(ntrans-1)/p->nbatch;  // then cut # thr in each b
-  } else {                                  // batchSize override by user
-    p->batchSize = min(p->opts.maxbatchsize,ntrans);
-    p->nbatch = 1+(ntrans-1)/p->batchSize;  // resulting # batches
+  if (p->opts.maxbatchsize == 0) {                  // logic to auto-set best batchsize
+    p->nbatch    = 1 + (ntrans - 1) / nthr;         // min # batches poss
+    p->batchSize = 1 + (ntrans - 1) / p->nbatch;    // then cut # thr in each b
+  } else {                                          // batchSize override by user
+    p->batchSize = min(p->opts.maxbatchsize, ntrans);
+    p->nbatch    = 1 + (ntrans - 1) / p->batchSize; // resulting # batches
   }
-  if (p->opts.spread_thread==0)
-    p->opts.spread_thread=2;                // our auto choice
-  if (p->opts.spread_thread!=1 && p->opts.spread_thread!=2) {
-    fprintf(stderr,"[%s] illegal opts.spread_thread!\n",__func__);
+  if (p->opts.spread_thread == 0) p->opts.spread_thread = 2; // our auto choice
+  if (p->opts.spread_thread != 1 && p->opts.spread_thread != 2) {
+    fprintf(stderr, "[%s] illegal opts.spread_thread!\n", __func__);
     return FINUFFT_ERR_SPREAD_THREAD_NOTVALID;
   }
 
-  if (type!=3) {    // read in user Fourier mode array sizes...
+  if (type != 3) {                      // read in user Fourier mode array sizes...
     p->ms = n_modes[0];
-    p->mt = (dim>1) ? n_modes[1] : 1;       // leave as 1 for unused dims
-    p->mu = (dim>2) ? n_modes[2] : 1;
-    p->N = p->ms*p->mt*p->mu;               // N = total # modes
+    p->mt = (dim > 1) ? n_modes[1] : 1; // leave as 1 for unused dims
+    p->mu = (dim > 2) ? n_modes[2] : 1;
+    p->N  = p->ms * p->mt * p->mu;      // N = total # modes
   }
-  
+
   // heuristic to choose default upsampfac... (currently two poss)
-  if (p->opts.upsampfac==0.0) {             // indicates auto-choose
-    p->opts.upsampfac=2.0;                  // default, and need for tol small
-    if (tol>=(FLT)1E-9) {                   // the tol sigma=5/4 can reach
-      if (type==3)                          // could move to setpts, more known?
-        p->opts.upsampfac=1.25;             // faster b/c smaller RAM & FFT
-      else if ((dim==1 && p->N>10000000) || (dim==2 && p->N>300000) || (dim==3 && p->N>3000000))  // type 1,2 heuristic cutoffs, double, typ tol, 12-core xeon
-        p->opts.upsampfac=1.25;
+  if (p->opts.upsampfac == 0.0) {            // indicates auto-choose
+    p->opts.upsampfac = 2.0;                 // default, and need for tol small
+    if (tol >= (FLT)1E-9) {                  // the tol sigma=5/4 can reach
+      if (type == 3)                         // could move to setpts, more known?
+        p->opts.upsampfac = 1.25;            // faster b/c smaller RAM & FFT
+      else if ((dim == 1 && p->N > 10000000) || (dim == 2 && p->N > 300000) ||
+               (dim == 3 && p->N > 3000000)) // type 1,2 heuristic cutoffs, double, typ
+                                             // tol, 12-core xeon
+        p->opts.upsampfac = 1.25;
     }
     if (p->opts.debug > 1)
-      printf("[%s] set auto upsampfac=%.2f\n",__func__,p->opts.upsampfac);
+      printf("[%s] set auto upsampfac=%.2f\n", __func__, p->opts.upsampfac);
   }
   // use opts to choose and write into plan's spread options...
   int ier = setup_spreader_for_nufft(p->spopts, tol, p->opts, dim);
-  if (ier>1)                                 // proceed if success or warning
+  if (ier > 1) // proceed if success or warning
     return ier;
 
   // set others as defaults (or unallocated for arrays)...
-  p->X = NULL; p->Y = NULL; p->Z = NULL;
-  p->phiHat1 = NULL; p->phiHat2 = NULL; p->phiHat3 = NULL;
-  p->nf1 = 1; p->nf2 = 1; p->nf3 = 1;  // crucial to leave as 1 for unused dims
-  p->sortIndices = NULL;               // used in all three types
-  
+  p->X           = NULL;
+  p->Y           = NULL;
+  p->Z           = NULL;
+  p->phiHat1     = NULL;
+  p->phiHat2     = NULL;
+  p->phiHat3     = NULL;
+  p->nf1         = 1;
+  p->nf2         = 1;
+  p->nf3         = 1;    // crucial to leave as 1 for unused dims
+  p->sortIndices = NULL; // used in all three types
+
   //  ------------------------ types 1,2: planning needed ---------------------
-  if (type==1 || type==2) {
+  if (type == 1 || type == 2) {
 
-    int nthr_fft = nthr;    // give FFTW all threads (or use o.spread_thread?)
-                            // Note: batchSize not used since might be only 1.
+    int nthr_fft = nthr; // give FFTW all threads (or use o.spread_thread?)
+                         // Note: batchSize not used since might be only 1.
     // Now place FFTW initialization in a lock, courtesy of OMP. Makes FINUFFT
     // thread-safe (can be called inside OMP)
     {
-      static bool did_fftw_init = false;    // the only global state of FINUFFT
+      static bool did_fftw_init = false; // the only global state of FINUFFT
       std::lock_guard<std::mutex> lock(fftw_lock);
       if (!did_fftw_init) {
-        FFTW_INIT();            // setup FFTW global state; should only do once
-        did_fftw_init = true;   // ensure other FINUFFT threads don't clash
+        FFTW_INIT();          // setup FFTW global state; should only do once
+        did_fftw_init = true; // ensure other FINUFFT threads don't clash
       }
     }
 
     p->spopts.spread_direction = type;
 
-    if (p->opts.showwarn) {  // user warn round-off error...
-      if (EPSILON*p->ms>1.0)
-        fprintf(stderr,"%s warning: rounding err predicted eps_mach*N1 = %.3g > 1 !\n",__func__,(double)(EPSILON*p->ms));
-      if (EPSILON*p->mt>1.0)
-        fprintf(stderr,"%s warning: rounding err predicted eps_mach*N2 = %.3g > 1 !\n",__func__,(double)(EPSILON*p->mt));
-      if (EPSILON*p->mu>1.0)
-        fprintf(stderr,"%s warning: rounding err predicted eps_mach*N3 = %.3g > 1 !\n",__func__,(double)(EPSILON*p->mu));
+    if (p->opts.showwarn) { // user warn round-off error...
+      if (EPSILON * p->ms > 1.0)
+        fprintf(stderr, "%s warning: rounding err predicted eps_mach*N1 = %.3g > 1 !\n",
+                __func__, (double)(EPSILON * p->ms));
+      if (EPSILON * p->mt > 1.0)
+        fprintf(stderr, "%s warning: rounding err predicted eps_mach*N2 = %.3g > 1 !\n",
+                __func__, (double)(EPSILON * p->mt));
+      if (EPSILON * p->mu > 1.0)
+        fprintf(stderr, "%s warning: rounding err predicted eps_mach*N3 = %.3g > 1 !\n",
+                __func__, (double)(EPSILON * p->mu));
     }
-    
+
     // determine fine grid sizes, sanity check..
     int nfier = SET_NF_TYPE12(p->ms, p->opts, p->spopts, &(p->nf1));
-    if (nfier) return nfier;    // nf too big; we're done
-    p->phiHat1 = (FLT*)malloc(sizeof(FLT)*(p->nf1/2 + 1));
+    if (nfier) return nfier; // nf too big; we're done
+    p->phiHat1 = (FLT *)malloc(sizeof(FLT) * (p->nf1 / 2 + 1));
     if (dim > 1) {
       nfier = SET_NF_TYPE12(p->mt, p->opts, p->spopts, &(p->nf2));
       if (nfier) return nfier;
-      p->phiHat2 = (FLT*)malloc(sizeof(FLT)*(p->nf2/2 + 1));
+      p->phiHat2 = (FLT *)malloc(sizeof(FLT) * (p->nf2 / 2 + 1));
     }
     if (dim > 2) {
-      nfier = SET_NF_TYPE12(p->mu, p->opts, p->spopts, &(p->nf3)); 
+      nfier = SET_NF_TYPE12(p->mu, p->opts, p->spopts, &(p->nf3));
       if (nfier) return nfier;
-      p->phiHat3 = (FLT*)malloc(sizeof(FLT)*(p->nf3/2 + 1));
+      p->phiHat3 = (FLT *)malloc(sizeof(FLT) * (p->nf3 / 2 + 1));
     }
 
     if (p->opts.debug) { // "long long" here is to avoid warnings with printf...
-      printf("[%s] %dd%d: (ms,mt,mu)=(%lld,%lld,%lld) (nf1,nf2,nf3)=(%lld,%lld,%lld)\n               ntrans=%d nthr=%d batchSize=%d ", __func__,
-             dim, type, (long long)p->ms,(long long)p->mt,
-             (long long) p->mu, (long long)p->nf1,(long long)p->nf2,
-             (long long)p->nf3, ntrans, nthr, p->batchSize);
-      if (p->batchSize==1)          // spread_thread has no effect in this case
+      printf("[%s] %dd%d: (ms,mt,mu)=(%lld,%lld,%lld) (nf1,nf2,nf3)=(%lld,%lld,%lld)\n   "
+             "            ntrans=%d nthr=%d batchSize=%d ",
+             __func__, dim, type, (long long)p->ms, (long long)p->mt, (long long)p->mu,
+             (long long)p->nf1, (long long)p->nf2, (long long)p->nf3, ntrans, nthr,
+             p->batchSize);
+      if (p->batchSize == 1) // spread_thread has no effect in this case
         printf("\n");
       else
         printf(" spread_thread=%d\n", p->opts.spread_thread);
     }
 
     // STEP 0: get Fourier coeffs of spreading kernel along each fine grid dim
-    CNTime timer; timer.start();
+    CNTime timer;
+    timer.start();
     onedim_fseries_kernel(p->nf1, p->phiHat1, p->spopts);
-    if (dim>1) onedim_fseries_kernel(p->nf2, p->phiHat2, p->spopts);
-    if (dim>2) onedim_fseries_kernel(p->nf3, p->phiHat3, p->spopts);
-    if (p->opts.debug) printf("[%s] kernel fser (ns=%d):\t\t%.3g s\n",__func__,p->spopts.nspread, timer.elapsedsec());
+    if (dim > 1) onedim_fseries_kernel(p->nf2, p->phiHat2, p->spopts);
+    if (dim > 2) onedim_fseries_kernel(p->nf3, p->phiHat3, p->spopts);
+    if (p->opts.debug)
+      printf("[%s] kernel fser (ns=%d):\t\t%.3g s\n", __func__, p->spopts.nspread,
+             timer.elapsedsec());
 
     timer.restart();
-    p->nf = p->nf1*p->nf2*p->nf3;      // fine grid total number of points
+    p->nf = p->nf1 * p->nf2 * p->nf3; // fine grid total number of points
     if (p->nf * p->batchSize > MAX_NF) {
-      fprintf(stderr, "[%s] fwBatch would be bigger than MAX_NF, not attempting malloc!\n",__func__);
+      fprintf(stderr,
+              "[%s] fwBatch would be bigger than MAX_NF, not attempting malloc!\n",
+              __func__);
       return FINUFFT_ERR_MAXNALLOC;
     }
 
     p->fwBatch = FFTW_ALLOC_CPX(p->nf * p->batchSize); // the big workspace
-    if (p->opts.debug) printf("[%s] fwBatch %.2fGB alloc:   \t%.3g s\n", __func__,(double)1E-09*sizeof(CPX)*p->nf*p->batchSize, timer.elapsedsec());
-    if(!p->fwBatch) {      // we don't catch all such mallocs, just this big one
-      fprintf(stderr, "[%s] FFTW malloc failed for fwBatch (working fine grids)!\n",__func__);
-      free(p->phiHat1); free(p->phiHat2); free(p->phiHat3);
+    if (p->opts.debug)
+      printf("[%s] fwBatch %.2fGB alloc:   \t%.3g s\n", __func__,
+             (double)1E-09 * sizeof(CPX) * p->nf * p->batchSize, timer.elapsedsec());
+    if (!p->fwBatch) { // we don't catch all such mallocs, just this big one
+      fprintf(stderr, "[%s] FFTW malloc failed for fwBatch (working fine grids)!\n",
+              __func__);
+      free(p->phiHat1);
+      free(p->phiHat2);
+      free(p->phiHat3);
       return FINUFFT_ERR_ALLOC;
     }
-   
-    timer.restart();            // plan the FFTW
+
+    timer.restart(); // plan the FFTW
     int *ns = GRIDSIZE_FOR_FFTW(p);
-    // fftw_plan_many_dft args: rank, gridsize/dim, howmany, in, inembed, istride, idist, ot, onembed, ostride, odist, sign, flags 
+    // fftw_plan_many_dft args: rank, gridsize/dim, howmany, in, inembed, istride, idist,
+    // ot, onembed, ostride, odist, sign, flags
     {
       std::lock_guard<std::mutex> lock(fftw_lock);
 
       // FFTW_PLAN_TH sets all future fftw_plan calls to use nthr_fft threads.
-      // FIXME: Since this might override what the user wants for fftw, we'd like to set it
-      // just for our one plan and then revert to the user value. Unfortunately
-      // fftw_planner_nthreads wasn't introduced until fftw 3.3.9, and there isn't a convenient
-      // mechanism to probe the version
+      // FIXME: Since this might override what the user wants for fftw, we'd like to set
+      // it just for our one plan and then revert to the user value. Unfortunately
+      // fftw_planner_nthreads wasn't introduced until fftw 3.3.9, and there isn't a
+      // convenient mechanism to probe the version
       FFTW_PLAN_TH(nthr_fft);
-      p->fftwPlan = FFTW_PLAN_MANY_DFT(dim, ns, p->batchSize, p->fwBatch, NULL, 1, p->nf, p->fwBatch, NULL, 1, p->nf,
-                                       p->fftSign, p->opts.fftw);
+      p->fftwPlan =
+          FFTW_PLAN_MANY_DFT(dim, ns, p->batchSize, p->fwBatch, NULL, 1, p->nf,
+                             p->fwBatch, NULL, 1, p->nf, p->fftSign, p->opts.fftw);
     }
-    if (p->opts.debug) printf("[%s] FFTW plan (mode %d, nthr=%d):\t%.3g s\n", __func__,p->opts.fftw, nthr_fft, timer.elapsedsec());
-    delete []ns;
-    
-  } else {  // -------------------------- type 3 (no planning) ------------
+    if (p->opts.debug)
+      printf("[%s] FFTW plan (mode %d, nthr=%d):\t%.3g s\n", __func__, p->opts.fftw,
+             nthr_fft, timer.elapsedsec());
+    delete[] ns;
+
+  } else { // -------------------------- type 3 (no planning) ------------
 
-    if (p->opts.debug) printf("[%s] %dd%d: ntrans=%d\n",__func__,dim,type,ntrans);
+    if (p->opts.debug) printf("[%s] %dd%d: ntrans=%d\n", __func__, dim, type, ntrans);
     // in case destroy occurs before setpts, need safe dummy ptrs/plans...
-    p->CpBatch = NULL;
-    p->fwBatch = NULL;
-    p->Sp = NULL; p->Tp = NULL; p->Up = NULL;
-    p->prephase = NULL;
-    p->deconv = NULL;
+    p->CpBatch     = NULL;
+    p->fwBatch     = NULL;
+    p->Sp          = NULL;
+    p->Tp          = NULL;
+    p->Up          = NULL;
+    p->prephase    = NULL;
+    p->deconv      = NULL;
     p->innerT2plan = NULL;
     // Type 3 will call finufft_makeplan for type 2; no need to init FFTW
     // Note we don't even know nj or nk yet, so can't do anything else!
   }
-  return ier;         // report setup_spreader status (could be warning)
+  return ier; // report setup_spreader status (could be warning)
 }
 
-
 // SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS
-int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT* xj, FLT* yj, FLT* zj,
-                   BIGINT nk, FLT* s, FLT* t, FLT* u)
+int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, BIGINT nk,
+                   FLT *s, FLT *t, FLT *u)
 /* For type 1,2: just checks and (possibly) sorts the NU xyz points, in prep for
    spreading. (The last 4 arguments are ignored.)
    For type 3: allocates internal working arrays, scales/centers the NU points
    and NU target freqs (stu), evaluates spreading kernel FT at all target freqs.
 */
 {
-  int d = p->dim;     // abbrev for spatial dim
-  CNTime timer; timer.start();
-  p->nj = nj;    // the user only now chooses how many NU (x,y,z) pts
-  if (nj<0) {
-    fprintf(stderr,"[%s] nj (%lld) cannot be negative!\n",__func__,(long long)nj);
+  int d = p->dim; // abbrev for spatial dim
+  CNTime timer;
+  timer.start();
+  p->nj = nj; // the user only now chooses how many NU (x,y,z) pts
+  if (nj < 0) {
+    fprintf(stderr, "[%s] nj (%lld) cannot be negative!\n", __func__, (long long)nj);
     return FINUFFT_ERR_NUM_NU_PTS_INVALID;
-  } else if (nj>MAX_NU_PTS) {
-    fprintf(stderr,"[%s] nj (%lld) exceeds MAX_NU_PTS\n",__func__,(long long)nj);
+  } else if (nj > MAX_NU_PTS) {
+    fprintf(stderr, "[%s] nj (%lld) exceeds MAX_NU_PTS\n", __func__, (long long)nj);
     return FINUFFT_ERR_NUM_NU_PTS_INVALID;
   }
-  
-  if (p->type!=3) {  // ------------------ TYPE 1,2 SETPTS -------------------
-                     // (all we can do is check and maybe bin-sort the NU pts)
-    p->X = xj;       // plan must keep pointers to user's fixed NU pts
-    p->Y = yj;
-    p->Z = zj;
+
+  if (p->type != 3) { // ------------------ TYPE 1,2 SETPTS -------------------
+                      // (all we can do is check and maybe bin-sort the NU pts)
+    p->X    = xj;     // plan must keep pointers to user's fixed NU pts
+    p->Y    = yj;
+    p->Z    = zj;
     int ier = spreadcheck(p->nf1, p->nf2, p->nf3, p->nj, xj, yj, zj, p->spopts);
-    if (p->opts.debug>1) printf("[%s] spreadcheck (%d):\t%.3g s\n", __func__, p->spopts.chkbnds, timer.elapsedsec());
-    if (ier)         // no warnings allowed here
-      return ier;    
+    if (p->opts.debug > 1)
+      printf("[%s] spreadcheck (%d):\t%.3g s\n", __func__, p->spopts.chkbnds,
+             timer.elapsedsec());
+    if (ier) // no warnings allowed here
+      return ier;
     timer.restart();
-    // Free sortIndices if it has been allocated before in case of repeated setpts calls causing memory leak.
-    // We don't know it is the same size as before, so we have to malloc each time.
+    // Free sortIndices if it has been allocated before in case of repeated setpts calls
+    // causing memory leak. We don't know it is the same size as before, so we have to
+    // malloc each time.
     if (p->sortIndices) free(p->sortIndices);
-    p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT)*p->nj);
+    p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT) * p->nj);
     if (!p->sortIndices) {
-      fprintf(stderr,"[%s] failed to allocate sortIndices!\n",__func__);
+      fprintf(stderr, "[%s] failed to allocate sortIndices!\n", __func__);
       return FINUFFT_ERR_SPREAD_ALLOC;
     }
-    p->didSort = indexSort(p->sortIndices, p->nf1, p->nf2, p->nf3, p->nj, xj, yj, zj, p->spopts);
-    if (p->opts.debug) printf("[%s] sort (didSort=%d):\t\t%.3g s\n", __func__,p->didSort, timer.elapsedsec());
+    p->didSort =
+        indexSort(p->sortIndices, p->nf1, p->nf2, p->nf3, p->nj, xj, yj, zj, p->spopts);
+    if (p->opts.debug)
+      printf("[%s] sort (didSort=%d):\t\t%.3g s\n", __func__, p->didSort,
+             timer.elapsedsec());
 
-    
-  } else {   // ------------------------- TYPE 3 SETPTS -----------------------
-             // (here we can precompute pre/post-phase factors and plan the t2)
+  } else { // ------------------------- TYPE 3 SETPTS -----------------------
+           // (here we can precompute pre/post-phase factors and plan the t2)
 
-    if (nk<0) {
-      fprintf(stderr,"[%s] nk (%lld) cannot be negative!\n",__func__,(long long)nk);
+    if (nk < 0) {
+      fprintf(stderr, "[%s] nk (%lld) cannot be negative!\n", __func__, (long long)nk);
       return FINUFFT_ERR_NUM_NU_PTS_INVALID;
-    } else if (nk>MAX_NU_PTS) {
-      fprintf(stderr,"[%s] nk (%lld) exceeds MAX_NU_PTS\n",__func__,(long long)nk);
+    } else if (nk > MAX_NU_PTS) {
+      fprintf(stderr, "[%s] nk (%lld) exceeds MAX_NU_PTS\n", __func__, (long long)nk);
       return FINUFFT_ERR_NUM_NU_PTS_INVALID;
     }
-    p->nk = nk;     // user set # targ freq pts
-    p->S = s;       // keep pointers to user's input target pts
-    p->T = t;
-    p->U = u;
+    p->nk = nk; // user set # targ freq pts
+    p->S  = s;  // keep pointers to user's input target pts
+    p->T  = t;
+    p->U  = u;
 
     // pick x, s intervals & shifts & # fine grid pts (nf) in each dim...
-    FLT S1,S2,S3;       // get half-width X, center C, which contains {x_j}...
-    arraywidcen(nj,xj,&(p->t3P.X1),&(p->t3P.C1));
-    arraywidcen(nk,s,&S1,&(p->t3P.D1));      // same D, S, but for {s_k}
-    set_nhg_type3(S1,p->t3P.X1,p->opts,p->spopts,
-           &(p->nf1),&(p->t3P.h1),&(p->t3P.gam1));  // applies twist i)
-    p->t3P.C2 = 0.0;        // their defaults if dim 2 unused, etc
+    FLT S1, S2, S3; // get half-width X, center C, which contains {x_j}...
+    arraywidcen(nj, xj, &(p->t3P.X1), &(p->t3P.C1));
+    arraywidcen(nk, s, &S1, &(p->t3P.D1)); // same D, S, but for {s_k}
+    set_nhg_type3(S1, p->t3P.X1, p->opts, p->spopts, &(p->nf1), &(p->t3P.h1),
+                  &(p->t3P.gam1));         // applies twist i)
+    p->t3P.C2 = 0.0;                       // their defaults if dim 2 unused, etc
     p->t3P.D2 = 0.0;
-    if (d>1) {
-      arraywidcen(nj,yj,&(p->t3P.X2),&(p->t3P.C2));     // {y_j}
-      arraywidcen(nk,t,&S2,&(p->t3P.D2));               // {t_k}
-      set_nhg_type3(S2,p->t3P.X2,p->opts,p->spopts,&(p->nf2),
-                    &(p->t3P.h2),&(p->t3P.gam2));
-    }    
+    if (d > 1) {
+      arraywidcen(nj, yj, &(p->t3P.X2), &(p->t3P.C2)); // {y_j}
+      arraywidcen(nk, t, &S2, &(p->t3P.D2));           // {t_k}
+      set_nhg_type3(S2, p->t3P.X2, p->opts, p->spopts, &(p->nf2), &(p->t3P.h2),
+                    &(p->t3P.gam2));
+    }
     p->t3P.C3 = 0.0;
     p->t3P.D3 = 0.0;
-    if (d>2) {
-      arraywidcen(nj,zj,&(p->t3P.X3),&(p->t3P.C3));     // {z_j}
-      arraywidcen(nk,u,&S3,&(p->t3P.D3));               // {u_k}
-      set_nhg_type3(S3,p->t3P.X3,p->opts,p->spopts,
-                    &(p->nf3),&(p->t3P.h3),&(p->t3P.gam3));
+    if (d > 2) {
+      arraywidcen(nj, zj, &(p->t3P.X3), &(p->t3P.C3)); // {z_j}
+      arraywidcen(nk, u, &S3, &(p->t3P.D3));           // {u_k}
+      set_nhg_type3(S3, p->t3P.X3, p->opts, p->spopts, &(p->nf3), &(p->t3P.h3),
+                    &(p->t3P.gam3));
     }
 
-    if (p->opts.debug) {  // report on choices of shifts, centers, etc...
-      printf("\tM=%lld N=%lld\n",(long long)nj,(long long)nk);
-      printf("\tX1=%.3g C1=%.3g S1=%.3g D1=%.3g gam1=%g nf1=%lld\t\n", p->t3P.X1, p->t3P.C1,S1, p->t3P.D1, p->t3P.gam1,(long long) p->nf1);
-      if (d>1)
-        printf("\tX2=%.3g C2=%.3g S2=%.3g D2=%.3g gam2=%g nf2=%lld\n",p->t3P.X2, p->t3P.C2,S2, p->t3P.D2, p->t3P.gam2,(long long) p->nf2);
-      if (d>2)
-        printf("\tX3=%.3g C3=%.3g S3=%.3g D3=%.3g gam3=%g nf3=%lld\n", p->t3P.X3, p->t3P.C3,S3, p->t3P.D3, p->t3P.gam3,(long long) p->nf3);
+    if (p->opts.debug) { // report on choices of shifts, centers, etc...
+      printf("\tM=%lld N=%lld\n", (long long)nj, (long long)nk);
+      printf("\tX1=%.3g C1=%.3g S1=%.3g D1=%.3g gam1=%g nf1=%lld\t\n", p->t3P.X1,
+             p->t3P.C1, S1, p->t3P.D1, p->t3P.gam1, (long long)p->nf1);
+      if (d > 1)
+        printf("\tX2=%.3g C2=%.3g S2=%.3g D2=%.3g gam2=%g nf2=%lld\n", p->t3P.X2,
+               p->t3P.C2, S2, p->t3P.D2, p->t3P.gam2, (long long)p->nf2);
+      if (d > 2)
+        printf("\tX3=%.3g C3=%.3g S3=%.3g D3=%.3g gam3=%g nf3=%lld\n", p->t3P.X3,
+               p->t3P.C3, S3, p->t3P.D3, p->t3P.gam3, (long long)p->nf3);
     }
-    p->nf = p->nf1*p->nf2*p->nf3;      // fine grid total number of points
+    p->nf = p->nf1 * p->nf2 * p->nf3; // fine grid total number of points
     if (p->nf * p->batchSize > MAX_NF) {
-      fprintf(stderr, "[%s t3] fwBatch would be bigger than MAX_NF, not attempting malloc!\n",__func__);
+      fprintf(stderr,
+              "[%s t3] fwBatch would be bigger than MAX_NF, not attempting malloc!\n",
+              __func__);
       return FINUFFT_ERR_MAXNALLOC;
     }
-    if (p->fwBatch)
-      FFTW_FR(p->fwBatch);
+    if (p->fwBatch) FFTW_FR(p->fwBatch);
     p->fwBatch = FFTW_ALLOC_CPX(p->nf * p->batchSize); // maybe big workspace
 
     // (note FFTW_ALLOC is not needed over malloc, but matches its type)
-    if(p->CpBatch) free(p->CpBatch);
-    p->CpBatch = (CPX*)malloc(sizeof(CPX) * nj*p->batchSize);  // batch c' work
-    if (p->opts.debug) printf("[%s t3] widcen, batch %.2fGB alloc:\t%.3g s\n", __func__, (double)1E-09*sizeof(CPX)*(p->nf+nj)*p->batchSize, timer.elapsedsec());
-    if(!p->fwBatch || !p->CpBatch) {
-      fprintf(stderr, "[%s t3] malloc fail for fwBatch or CpBatch!\n",__func__);
-      return FINUFFT_ERR_ALLOC; 
+    if (p->CpBatch) free(p->CpBatch);
+    p->CpBatch = (CPX *)malloc(sizeof(CPX) * nj * p->batchSize); // batch c' work
+    if (p->opts.debug)
+      printf("[%s t3] widcen, batch %.2fGB alloc:\t%.3g s\n", __func__,
+             (double)1E-09 * sizeof(CPX) * (p->nf + nj) * p->batchSize,
+             timer.elapsedsec());
+    if (!p->fwBatch || !p->CpBatch) {
+      fprintf(stderr, "[%s t3] malloc fail for fwBatch or CpBatch!\n", __func__);
+      return FINUFFT_ERR_ALLOC;
     }
-    //printf("fwbatch, cpbatch ptrs: %llx %llx\n",p->fwBatch,p->CpBatch);
+    // printf("fwbatch, cpbatch ptrs: %llx %llx\n",p->fwBatch,p->CpBatch);
 
     // alloc rescaled NU src pts x'_j (in X etc), rescaled NU targ pts s'_k ...
-    if(p->X) free(p->X);
-    if(p->Sp) free(p->Sp);
-    p->X = (FLT*)malloc(sizeof(FLT)*nj);
-    p->Sp = (FLT*)malloc(sizeof(FLT)*nk);
-    if (d>1) {
-      if(p->Y) free(p->Y);
-      if(p->Tp) free(p->Tp);
-      p->Y = (FLT*)malloc(sizeof(FLT)*nj);
-      p->Tp = (FLT*)malloc(sizeof(FLT)*nk);
+    if (p->X) free(p->X);
+    if (p->Sp) free(p->Sp);
+    p->X  = (FLT *)malloc(sizeof(FLT) * nj);
+    p->Sp = (FLT *)malloc(sizeof(FLT) * nk);
+    if (d > 1) {
+      if (p->Y) free(p->Y);
+      if (p->Tp) free(p->Tp);
+      p->Y  = (FLT *)malloc(sizeof(FLT) * nj);
+      p->Tp = (FLT *)malloc(sizeof(FLT) * nk);
     }
-    if (d>2) {
-      if(p->Z) free(p->Z);
-      if(p->Up) free(p->Up);
-      p->Z = (FLT*)malloc(sizeof(FLT)*nj);
-      p->Up = (FLT*)malloc(sizeof(FLT)*nk);
+    if (d > 2) {
+      if (p->Z) free(p->Z);
+      if (p->Up) free(p->Up);
+      p->Z  = (FLT *)malloc(sizeof(FLT) * nj);
+      p->Up = (FLT *)malloc(sizeof(FLT) * nk);
     }
 
     // always shift as use gam to rescale x_j to x'_j, etc (twist iii)...
-    FLT ig1 = 1.0/p->t3P.gam1, ig2=0.0, ig3=0.0;   // "reciprocal-math" optim
-    if (d>1)
-      ig2 = 1.0/p->t3P.gam2;
-    if (d>2)
-      ig3 = 1.0/p->t3P.gam3;
+    FLT ig1 = 1.0 / p->t3P.gam1, ig2 = 0.0, ig3 = 0.0; // "reciprocal-math" optim
+    if (d > 1) ig2 = 1.0 / p->t3P.gam2;
+    if (d > 2) ig3 = 1.0 / p->t3P.gam3;
 #pragma omp parallel for num_threads(p->opts.nthreads) schedule(static)
-    for (BIGINT j=0;j<nj;++j) {
-      p->X[j] = (xj[j] - p->t3P.C1) * ig1;         // rescale x_j
-      if (d>1)        // (ok to do inside loop because of branch predict)
-        p->Y[j] = (yj[j]- p->t3P.C2) * ig2;        // rescale y_j
-      if (d>2)
-        p->Z[j] = (zj[j] - p->t3P.C3) * ig3;       // rescale z_j
+    for (BIGINT j = 0; j < nj; ++j) {
+      p->X[j] = (xj[j] - p->t3P.C1) * ig1; // rescale x_j
+      if (d > 1) // (ok to do inside loop because of branch predict)
+        p->Y[j] = (yj[j] - p->t3P.C2) * ig2;          // rescale y_j
+      if (d > 2) p->Z[j] = (zj[j] - p->t3P.C3) * ig3; // rescale z_j
     }
 
     // set up prephase array...
-    CPX imasign = (p->fftSign>=0) ? IMA : -IMA;             // +-i
-    if(p->prephase) free(p->prephase);
-    p->prephase = (CPX*)malloc(sizeof(CPX)*nj);
-    if (p->t3P.D1!=0.0 || p->t3P.D2!=0.0 || p->t3P.D3!=0.0) {
+    CPX imasign = (p->fftSign >= 0) ? IMA : -IMA; // +-i
+    if (p->prephase) free(p->prephase);
+    p->prephase = (CPX *)malloc(sizeof(CPX) * nj);
+    if (p->t3P.D1 != 0.0 || p->t3P.D2 != 0.0 || p->t3P.D3 != 0.0) {
 #pragma omp parallel for num_threads(p->opts.nthreads) schedule(static)
-      for (BIGINT j=0;j<nj;++j) {          // ... loop over src NU locs
-        FLT phase = p->t3P.D1*xj[j];
-        if (d>1)
-          phase += p->t3P.D2*yj[j];
-        if (d>2)
-          phase += p->t3P.D3*zj[j];
-        p->prephase[j] = cos(phase)+imasign*sin(phase);   // Euler e^{+-i.phase}
+      for (BIGINT j = 0; j < nj; ++j) { // ... loop over src NU locs
+        FLT phase = p->t3P.D1 * xj[j];
+        if (d > 1) phase += p->t3P.D2 * yj[j];
+        if (d > 2) phase += p->t3P.D3 * zj[j];
+        p->prephase[j] = cos(phase) + imasign * sin(phase); // Euler e^{+-i.phase}
       }
     } else
-      for (BIGINT j=0;j<nj;++j)
-        p->prephase[j] = (CPX)1.0;     // *** or keep flag so no mult in exec??
-      
-    // rescale the target s_k etc to s'_k etc...
+      for (BIGINT j = 0; j < nj; ++j)
+        p->prephase[j] = (CPX)1.0; // *** or keep flag so no mult in exec??
+
+                                   // rescale the target s_k etc to s'_k etc...
 #pragma omp parallel for num_threads(p->opts.nthreads) schedule(static)
-    for (BIGINT k=0;k<nk;++k) {
-      p->Sp[k] = p->t3P.h1*p->t3P.gam1*(s[k]- p->t3P.D1);  // so |s'_k| < pi/R
-      if (d>1)
-        p->Tp[k] = p->t3P.h2*p->t3P.gam2*(t[k]- p->t3P.D2);  // so |t'_k| < pi/R
-      if (d>2)
-        p->Up[k] = p->t3P.h3*p->t3P.gam3*(u[k]- p->t3P.D3);  // so |u'_k| < pi/R
+    for (BIGINT k = 0; k < nk; ++k) {
+      p->Sp[k] = p->t3P.h1 * p->t3P.gam1 * (s[k] - p->t3P.D1);   // so |s'_k| < pi/R
+      if (d > 1)
+        p->Tp[k] = p->t3P.h2 * p->t3P.gam2 * (t[k] - p->t3P.D2); // so |t'_k| < pi/R
+      if (d > 2)
+        p->Up[k] = p->t3P.h3 * p->t3P.gam3 * (u[k] - p->t3P.D3); // so |u'_k| < pi/R
     }
-    
+
     // (old STEP 3a) Compute deconvolution post-factors array (per targ pt)...
     // (exploits that FT separates because kernel is prod of 1D funcs)
-    if(p->deconv) free(p->deconv);
-    p->deconv = (CPX*)malloc(sizeof(CPX)*nk);
-    FLT *phiHatk1 = (FLT*)malloc(sizeof(FLT)*nk);  // don't confuse w/ p->phiHat
-    onedim_nuft_kernel(nk, p->Sp, phiHatk1, p->spopts);         // fill phiHat1
+    if (p->deconv) free(p->deconv);
+    p->deconv     = (CPX *)malloc(sizeof(CPX) * nk);
+    FLT *phiHatk1 = (FLT *)malloc(sizeof(FLT) * nk);    // don't confuse w/ p->phiHat
+    onedim_nuft_kernel(nk, p->Sp, phiHatk1, p->spopts); // fill phiHat1
     FLT *phiHatk2 = NULL, *phiHatk3 = NULL;
-    if (d>1) {
-      phiHatk2 = (FLT*)malloc(sizeof(FLT)*nk);
-      onedim_nuft_kernel(nk, p->Tp, phiHatk2, p->spopts);       // fill phiHat2
+    if (d > 1) {
+      phiHatk2 = (FLT *)malloc(sizeof(FLT) * nk);
+      onedim_nuft_kernel(nk, p->Tp, phiHatk2, p->spopts); // fill phiHat2
     }
-    if (d>2) {
-      phiHatk3 = (FLT*)malloc(sizeof(FLT)*nk);
-      onedim_nuft_kernel(nk, p->Up, phiHatk3, p->spopts);       // fill phiHat3
+    if (d > 2) {
+      phiHatk3 = (FLT *)malloc(sizeof(FLT) * nk);
+      onedim_nuft_kernel(nk, p->Up, phiHatk3, p->spopts); // fill phiHat3
     }
-    int Cfinite = isfinite(p->t3P.C1) && isfinite(p->t3P.C2) && isfinite(p->t3P.C3);    // C can be nan or inf if M=0, no input NU pts
-    int Cnonzero = p->t3P.C1!=0.0 || p->t3P.C2!=0.0 || p->t3P.C3!=0.0;  // cen
+    int Cfinite =
+        isfinite(p->t3P.C1) && isfinite(p->t3P.C2) && isfinite(p->t3P.C3); // C can be nan
+                                                                           // or inf if
+                                                                           // M=0, no
+                                                                           // input NU pts
+    int Cnonzero = p->t3P.C1 != 0.0 || p->t3P.C2 != 0.0 || p->t3P.C3 != 0.0; // cen
 #pragma omp parallel for num_threads(p->opts.nthreads) schedule(static)
-    for (BIGINT k=0;k<nk;++k) {         // .... loop over NU targ freqs
+    for (BIGINT k = 0; k < nk; ++k) { // .... loop over NU targ freqs
       FLT phiHat = phiHatk1[k];
-      if (d>1)
-        phiHat *= phiHatk2[k];
-      if (d>2)
-        phiHat *= phiHatk3[k];
+      if (d > 1) phiHat *= phiHatk2[k];
+      if (d > 2) phiHat *= phiHatk3[k];
       p->deconv[k] = (CPX)(1.0 / phiHat);
       if (Cfinite && Cnonzero) {
         FLT phase = (s[k] - p->t3P.D1) * p->t3P.C1;
-        if (d>1)
-          phase += (t[k] - p->t3P.D2) * p->t3P.C2;
-        if (d>2)
-          phase += (u[k] - p->t3P.D3) * p->t3P.C3;
-        p->deconv[k] *= cos(phase)+imasign*sin(phase);   // Euler e^{+-i.phase}
+        if (d > 1) phase += (t[k] - p->t3P.D2) * p->t3P.C2;
+        if (d > 2) phase += (u[k] - p->t3P.D3) * p->t3P.C3;
+        p->deconv[k] *= cos(phase) + imasign * sin(phase); // Euler e^{+-i.phase}
       }
     }
-    free(phiHatk1); free(phiHatk2); free(phiHatk3);  // done w/ deconv fill
-    if (p->opts.debug) printf("[%s t3] phase & deconv factors:\t%.3g s\n",__func__,timer.elapsedsec());
+    free(phiHatk1);
+    free(phiHatk2);
+    free(phiHatk3); // done w/ deconv fill
+    if (p->opts.debug)
+      printf("[%s t3] phase & deconv factors:\t%.3g s\n", __func__, timer.elapsedsec());
 
     // Set up sort for spreading Cp (from primed NU src pts X, Y, Z) to fw...
     timer.restart();
-    // Free sortIndices if it has been allocated before in case of repeated setpts calls causing memory leak.
-    // We don't know it is the same size as before, so we have to malloc each time.
+    // Free sortIndices if it has been allocated before in case of repeated setpts calls
+    // causing memory leak. We don't know it is the same size as before, so we have to
+    // malloc each time.
     if (p->sortIndices) free(p->sortIndices);
-    p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT)*p->nj);
+    p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT) * p->nj);
     if (!p->sortIndices) {
-      fprintf(stderr,"[%s t3] failed to allocate sortIndices!\n",__func__);
+      fprintf(stderr, "[%s t3] failed to allocate sortIndices!\n", __func__);
       return FINUFFT_ERR_SPREAD_ALLOC;
     }
-    p->didSort = indexSort(p->sortIndices, p->nf1, p->nf2, p->nf3, p->nj, p->X, p->Y, p->Z, p->spopts);
-    if (p->opts.debug) printf("[%s t3] sort (didSort=%d):\t\t%.3g s\n",__func__, p->didSort, timer.elapsedsec());
- 
+    p->didSort = indexSort(p->sortIndices, p->nf1, p->nf2, p->nf3, p->nj, p->X, p->Y,
+                           p->Z, p->spopts);
+    if (p->opts.debug)
+      printf("[%s t3] sort (didSort=%d):\t\t%.3g s\n", __func__, p->didSort,
+             timer.elapsedsec());
+
     // Plan and setpts once, for the (repeated) inner type 2 finufft call...
     timer.restart();
-    BIGINT t2nmodes[] = {p->nf1,p->nf2,p->nf3};   // t2 input is actually fw
-    finufft_opts t2opts = p->opts;                  // deep copy, since not ptrs
-    t2opts.modeord = 0;                           // needed for correct t3!
-    t2opts.debug = max(0,p->opts.debug-1);        // don't print as much detail
-    t2opts.spread_debug = max(0,p->opts.spread_debug-1);
-    t2opts.showwarn = 0;                          // so don't see warnings 2x
+    BIGINT t2nmodes[]   = {p->nf1, p->nf2, p->nf3};  // t2 input is actually fw
+    finufft_opts t2opts = p->opts;                   // deep copy, since not ptrs
+    t2opts.modeord      = 0;                         // needed for correct t3!
+    t2opts.debug        = max(0, p->opts.debug - 1); // don't print as much detail
+    t2opts.spread_debug = max(0, p->opts.spread_debug - 1);
+    t2opts.showwarn     = 0;                         // so don't see warnings 2x
     // (...could vary other t2opts here?)
-    if(p->innerT2plan) FINUFFT_DESTROY(p->innerT2plan);
+    if (p->innerT2plan) FINUFFT_DESTROY(p->innerT2plan);
     int ier = FINUFFT_MAKEPLAN(2, d, t2nmodes, p->fftSign, p->batchSize, p->tol,
                                &p->innerT2plan, &t2opts);
-    if (ier>1) {     // if merely warning, still proceed
-      fprintf(stderr,"[%s t3]: inner type 2 plan creation failed with ier=%d!\n",__func__,ier);
+    if (ier > 1) { // if merely warning, still proceed
+      fprintf(stderr, "[%s t3]: inner type 2 plan creation failed with ier=%d!\n",
+              __func__, ier);
       return ier;
     }
-    ier = FINUFFT_SETPTS(p->innerT2plan, nk, p->Sp, p->Tp, p->Up, 0, NULL, NULL, NULL);  // note nk = # output points (not nj)
-    if (ier>1) {
-      fprintf(stderr,"[%s t3]: inner type 2 setpts failed, ier=%d!\n",__func__,ier);
+    ier = FINUFFT_SETPTS(p->innerT2plan, nk, p->Sp, p->Tp, p->Up, 0, NULL, NULL,
+                         NULL); // note nk = # output points (not nj)
+    if (ier > 1) {
+      fprintf(stderr, "[%s t3]: inner type 2 setpts failed, ier=%d!\n", __func__, ier);
       return ier;
     }
-    if (p->opts.debug) printf("[%s t3] inner t2 plan & setpts: \t%.3g s\n", __func__,timer.elapsedsec());
-
+    if (p->opts.debug)
+      printf("[%s t3] inner t2 plan & setpts: \t%.3g s\n", __func__, timer.elapsedsec());
   }
   return 0;
 }
 // ............ end setpts ..................................................
 
-
 // EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE
-int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX* cj, CPX* fk){
-/* See ../docs/cguru.doc for current documentation.
-
-   For given (stack of) weights cj or coefficients fk, performs NUFFTs with
-   existing (sorted) NU pts and existing plan.
-   For type 1 and 3: cj is input, fk is output.
-   For type 2: fk is input, cj is output.
-   Performs spread/interp, pre/post deconvolve, and fftw_execute as appropriate
-   for each of the 3 types.
-   For cases of ntrans>1, performs work in blocks of size up to batchSize.
-   Return value 0 (no error diagnosis yet).
-   Barnett 5/20/20, based on Malleo 2019.
-*/
-  CNTime timer; timer.start();
-  
-  if (p->type!=3){ // --------------------- TYPE 1,2 EXEC ------------------
-  
-    double t_sprint = 0.0, t_fft = 0.0, t_deconv = 0.0;  // accumulated timing
+int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX *cj, CPX *fk) {
+  /* See ../docs/cguru.doc for current documentation.
+
+     For given (stack of) weights cj or coefficients fk, performs NUFFTs with
+     existing (sorted) NU pts and existing plan.
+     For type 1 and 3: cj is input, fk is output.
+     For type 2: fk is input, cj is output.
+     Performs spread/interp, pre/post deconvolve, and fftw_execute as appropriate
+     for each of the 3 types.
+     For cases of ntrans>1, performs work in blocks of size up to batchSize.
+     Always returns 0 (no error diagnosis yet; t3's inner t2 status is ignored).
+     Barnett 5/20/20, based on Malleo 2019.
+  */
+  CNTime timer;
+  timer.start();
+
+  if (p->type != 3) { // --------------------- TYPE 1,2 EXEC ------------------
+
+    double t_sprint = 0.0, t_fft = 0.0, t_deconv = 0.0; // accumulated timing
     if (p->opts.debug)
-      printf("[%s] start ntrans=%d (%d batches, bsize=%d)...\n", __func__, p->ntrans, p->nbatch, p->batchSize);
-    
-    for (int b=0; b*p->batchSize < p->ntrans; b++) { // .....loop b over batches
+      printf("[%s] start ntrans=%d (%d batches, bsize=%d)...\n", __func__, p->ntrans,
+             p->nbatch, p->batchSize);
+
+    for (int b = 0; b * p->batchSize < p->ntrans; b++) { // .....loop b over batches
 
       // current batch is either batchSize, or possibly truncated if last one
-      int thisBatchSize = min(p->ntrans - b*p->batchSize, p->batchSize);
-      int bB = b*p->batchSize;         // index of vector, since batchsizes same
-      CPX* cjb = cj + bB*p->nj;        // point to batch of weights
-      CPX* fkb = fk + bB*p->N;         // point to batch of mode coeffs
-      if (p->opts.debug>1) printf("[%s] start batch %d (size %d):\n",__func__, b,thisBatchSize);
-      
+      int thisBatchSize = min(p->ntrans - b * p->batchSize, p->batchSize);
+      int bB            = b * p->batchSize; // index of vector, since batchsizes same
+      CPX *cjb          = cj + bB * p->nj;  // point to batch of weights
+      CPX *fkb          = fk + bB * p->N;   // point to batch of mode coeffs
+      if (p->opts.debug > 1)
+        printf("[%s] start batch %d (size %d):\n", __func__, b, thisBatchSize);
+
       // STEP 1: (varies by type)
       timer.restart();
-      if (p->type == 1) {  // type 1: spread NU pts p->X, weights cj, to fw grid
+      if (p->type == 1) { // type 1: spread NU pts p->X, weights cj, to fw grid
         spreadinterpSortedBatch(thisBatchSize, p, cjb);
         t_sprint += timer.elapsedsec();
-      } else {          //  type 2: amplify Fourier coeffs fk into 0-padded fw
+      } else { //  type 2: amplify Fourier coeffs fk into 0-padded fw
         deconvolveBatch(thisBatchSize, p, fkb);
         t_deconv += timer.elapsedsec();
       }
-             
+
       // STEP 2: call the pre-planned FFT on this batch
       timer.restart();
-      FFTW_EX(p->fftwPlan);   // if thisBatchSize<batchSize it wastes some flops
+      FFTW_EX(p->fftwPlan); // if thisBatchSize<batchSize it wastes some flops
       t_fft += timer.elapsedsec();
-      if (p->opts.debug>1)
-        printf("\tFFTW exec:\t\t%.3g s\n", timer.elapsedsec());
-      
+      if (p->opts.debug > 1) printf("\tFFTW exec:\t\t%.3g s\n", timer.elapsedsec());
+
       // STEP 3: (varies by type)
-      timer.restart();        
-      if (p->type == 1) {   // type 1: deconvolve (amplify) fw and shuffle to fk
+      timer.restart();
+      if (p->type == 1) { // type 1: deconvolve (amplify) fw and shuffle to fk
         deconvolveBatch(thisBatchSize, p, fkb);
         t_deconv += timer.elapsedsec();
-      } else {          // type 2: interpolate unif fw grid to NU target pts
+      } else { // type 2: interpolate unif fw grid to NU target pts
         spreadinterpSortedBatch(thisBatchSize, p, cjb);
-        t_sprint += timer.elapsedsec(); 
+        t_sprint += timer.elapsedsec();
       }
-    }                                                   // ........end b loop
-    
-    if (p->opts.debug) {  // report total times in their natural order...
-      if(p->type == 1) {
-        printf("[%s] done. tot spread:\t\t%.3g s\n",__func__,t_sprint);
+    } // ........end b loop
+
+    if (p->opts.debug) { // report total times in their natural order...
+      if (p->type == 1) {
+        printf("[%s] done. tot spread:\t\t%.3g s\n", __func__, t_sprint);
         printf("               tot FFT:\t\t\t\t%.3g s\n", t_fft);
         printf("               tot deconvolve:\t\t\t%.3g s\n", t_deconv);
       } else {
-        printf("[%s] done. tot deconvolve:\t\t%.3g s\n",__func__,t_deconv);
+        printf("[%s] done. tot deconvolve:\t\t%.3g s\n", __func__, t_deconv);
         printf("               tot FFT:\t\t\t\t%.3g s\n", t_fft);
-        printf("               tot interp:\t\t\t%.3g s\n",t_sprint);
+        printf("               tot interp:\t\t\t%.3g s\n", t_sprint);
       }
     }
   }
 
-  else {  // ----------------------------- TYPE 3 EXEC ---------------------
+  else { // ----------------------------- TYPE 3 EXEC ---------------------
 
-    //for (BIGINT j=0;j<10;++j) printf("\tcj[%ld]=%.15g+%.15gi\n",(long int)j,(double)real(cj[j]),(double)imag(cj[j]));  // debug
-    
-    double t_pre=0.0, t_spr=0.0, t_t2=0.0, t_deconv=0.0;  // accumulated timings
+    // for (BIGINT j=0;j<10;++j) printf("\tcj[%ld]=%.15g+%.15gi\n",
+    //   (long int)j,(double)real(cj[j]),(double)imag(cj[j]));  // debug
+
+    double t_pre = 0.0, t_spr = 0.0, t_t2 = 0.0, t_deconv = 0.0; // accumulated timings
     if (p->opts.debug)
-      printf("[%s t3] start ntrans=%d (%d batches, bsize=%d)...\n",__func__,p->ntrans, p->nbatch, p->batchSize);
+      printf("[%s t3] start ntrans=%d (%d batches, bsize=%d)...\n", __func__, p->ntrans,
+             p->nbatch, p->batchSize);
 
-    for (int b=0; b*p->batchSize < p->ntrans; b++) { // .....loop b over batches
+    for (int b = 0; b * p->batchSize < p->ntrans; b++) { // .....loop b over batches
 
       // batching and pointers to this batch, identical to t1,2 above...
-      int thisBatchSize = min(p->ntrans - b*p->batchSize, p->batchSize);
-      int bB = b*p->batchSize;
-      CPX* cjb = cj + bB*p->nj;           // batch of input strengths
-      CPX* fkb = fk + bB*p->nk;           // batch of output strengths
-      if (p->opts.debug>1) printf("[%s t3] start batch %d (size %d):\n",__func__,b,thisBatchSize);
-      
+      int thisBatchSize = min(p->ntrans - b * p->batchSize, p->batchSize);
+      int bB            = b * p->batchSize;
+      CPX *cjb          = cj + bB * p->nj; // batch of input strengths
+      CPX *fkb          = fk + bB * p->nk; // batch of output strengths
+      if (p->opts.debug > 1)
+        printf("[%s t3] start batch %d (size %d):\n", __func__, b, thisBatchSize);
+
       // STEP 0: pre-phase (possibly) the c_j input strengths into c'_j batch...
       timer.restart();
-#pragma omp parallel for num_threads(p->opts.nthreads)   // or p->batchSize?
-      for (int i=0; i<thisBatchSize; i++) {
-        BIGINT ioff = i*p->nj;
-        for (BIGINT j=0;j<p->nj;++j)
-          p->CpBatch[ioff+j] = p->prephase[j] * cjb[ioff+j];
+#pragma omp parallel for num_threads(p->opts.nthreads) // or p->batchSize?
+      for (int i = 0; i < thisBatchSize; i++) {
+        BIGINT ioff = i * p->nj;
+        for (BIGINT j = 0; j < p->nj; ++j)
+          p->CpBatch[ioff + j] = p->prephase[j] * cjb[ioff + j];
       }
-      t_pre += timer.elapsedsec(); 
-      
+      t_pre += timer.elapsedsec();
+
       // STEP 1: spread c'_j batch (x'_j NU pts) into fw batch grid...
       timer.restart();
-      p->spopts.spread_direction = 1;                         // spread
-      spreadinterpSortedBatch(thisBatchSize, p, p->CpBatch);  // p->X are primed
+      p->spopts.spread_direction = 1;                        // spread
+      spreadinterpSortedBatch(thisBatchSize, p, p->CpBatch); // p->X are primed
       t_spr += timer.elapsedsec();
 
-      //for (int j=0;j<p->nf1;++j) printf("fw[%d]=%.3g+%.3gi\n",j,p->fwBatch[j][0],p->fwBatch[j][1]);  // debug
-   
+      // for (int j=0;j<p->nf1;++j)
+      //   printf("fw[%d]=%.3g+%.3gi\n",j,p->fwBatch[j][0],p->fwBatch[j][1]);  // debug
+
       // STEP 2: type 2 NUFFT from fw batch to user output fk array batch...
       timer.restart();
       // illegal possible shrink of ntrans *after* plan for smaller last batch:
-      p->innerT2plan->ntrans = thisBatchSize;      // do not try this at home!
+      p->innerT2plan->ntrans = thisBatchSize; // do not try this at home!
       /* (alarming that FFTW not shrunk, but safe, because t2's fwBatch array
          still the same size, as Andrea explained; just wastes a few flops) */
-      FINUFFT_EXECUTE(p->innerT2plan, fkb, (CPX*)(p->fwBatch));
+      FINUFFT_EXECUTE(p->innerT2plan, fkb, (CPX *)(p->fwBatch));
       t_t2 += timer.elapsedsec();
 
       // STEP 3: apply deconvolve (precomputed 1/phiHat(targ_k), phasing too)...
       timer.restart();
 #pragma omp parallel for num_threads(p->opts.nthreads)
-      for (int i=0; i<thisBatchSize; i++) {
-        BIGINT ioff = i*p->nk;
-        for (BIGINT k=0;k<p->nk;++k)
-          fkb[ioff+k] *= p->deconv[k];
+      for (int i = 0; i < thisBatchSize; i++) {
+        BIGINT ioff = i * p->nk;
+        for (BIGINT k = 0; k < p->nk; ++k) fkb[ioff + k] *= p->deconv[k];
       }
       t_deconv += timer.elapsedsec();
-    }                                                   // ........end b loop
+    } // ........end b loop
 
-    if (p->opts.debug) {  // report total times in their natural order...
-      printf("[%s t3] done. tot prephase:\t\t%.3g s\n",__func__,t_pre);
-      printf("                  tot spread:\t\t\t%.3g s\n",t_spr);
+    if (p->opts.debug) { // report total times in their natural order...
+      printf("[%s t3] done. tot prephase:\t\t%.3g s\n", __func__, t_pre);
+      printf("                  tot spread:\t\t\t%.3g s\n", t_spr);
       printf("                  tot type 2:\t\t\t%.3g s\n", t_t2);
       printf("                  tot deconvolve:\t\t%.3g s\n", t_deconv);
-    }    
+    }
   }
-  //for (BIGINT k=0;k<10;++k) printf("\tfk[%ld]=%.15g+%.15gi\n",(long int)k,(double)real(fk[k]),(double)imag(fk[k]));  // debug
-  
-  return 0; 
-}
+  // for (BIGINT k=0;k<10;++k) printf("\tfk[%ld]=%.15g+%.15gi\n",
+  //   (long int)k,(double)real(fk[k]),(double)imag(fk[k]));  // debug
 
+  return 0;
+}
 
 // DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD
 int FINUFFT_DESTROY(FINUFFT_PLAN p)
@@ -1165,12 +1221,12 @@ int FINUFFT_DESTROY(FINUFFT_PLAN p)
 // Thus either each thing free'd here is guaranteed to be NULL or correctly
 // allocated.
 {
-  if (!p)                // NULL ptr, so not a ptr to a plan, report error
+  if (!p) // NULL ptr, so not a ptr to a plan, report error
     return 1;
 
   FFTW_FR(p->fwBatch); // free the big FFTW (or t3 spread) working array
   free(p->sortIndices);
-  if (p->type==1 || p->type==2) {
+  if (p->type == 1 || p->type == 2) {
     {
       std::lock_guard<std::mutex> lock(fftw_lock);
       FFTW_DE(p->fftwPlan);
@@ -1178,14 +1234,18 @@ int FINUFFT_DESTROY(FINUFFT_PLAN p)
     free(p->phiHat1);
     free(p->phiHat2);
     free(p->phiHat3);
-  } else {               // free the stuff alloc for type 3 only
-    FINUFFT_DESTROY(p->innerT2plan);   // if NULL, ignore its error code
+  } else {                           // free the stuff alloc for type 3 only
+    FINUFFT_DESTROY(p->innerT2plan); // if NULL, ignore its error code
     free(p->CpBatch);
-    free(p->Sp); free(p->Tp); free(p->Up);
-    free(p->X); free(p->Y); free(p->Z);
+    free(p->Sp);
+    free(p->Tp);
+    free(p->Up);
+    free(p->X);
+    free(p->Y);
+    free(p->Z);
     free(p->prephase);
     free(p->deconv);
   }
   delete p;
-  return 0;              // success
+  return 0; // success
 }
diff --git a/src/ker_horner_allw_loop_constexpr.h b/src/ker_horner_allw_loop_constexpr.h
index 6de0540e9..25a791ddb 100644
--- a/src/ker_horner_allw_loop_constexpr.h
+++ b/src/ker_horner_allw_loop_constexpr.h
@@ -5,228 +5,909 @@
 
 template<class T, std::size_t w, std::size_t nc>
 constexpr std::array<std::array<T, w>, nc> get_horner_coeffs() noexcept {
-    if constexpr (w == 2) {
-      return std::array<std::array<T, w>, nc> {{
-          {4.5147043243215315E+01,  4.5147043243215300E+01},
-          {5.7408070938221300E+01,  -5.7408070938221293E+01},
-          {-1.8395117920046484E+00, -1.8395117920046560E+00},
-          {-2.0382426253182082E+01, 2.0382426253182086E+01},
-          {-2.0940804433577420E+00, -2.0940804433577389E+00}
-      }};
-    } else if constexpr (w == 3) {
-      return std::array<std::array<T, w>, nc> {{
-          {1.5653991189315119E+02,  8.8006872410780295E+02,  1.5653991189967152E+02},
-          {3.1653018869611077E+02,  7.4325702843759617E-14,  -3.1653018868907071E+02},
-          {1.7742692790454484E+02,  -3.3149255274727801E+02, 1.7742692791117119E+02},
-          {-1.5357716116473156E+01, 9.5071486252033243E-15,  1.5357716122720193E+01},
-          {-3.7757583061523668E+01, 5.3222970968867315E+01,  -3.7757583054647384E+01},
-          {-3.9654011076088804E+00, 1.8062124448285358E-13,  3.9654011139270540E+00}
-      }};
-    } else if constexpr (w == 4) {
-      return std::array<std::array<T, w>, nc> {{
-          {5.4284366850213200E+02,  1.0073871433088398E+04,  1.0073871433088396E+04,  5.4284366850213223E+02},
-          {1.4650917259256939E+03,  6.1905285583602863E+03,  -6.1905285583602881E+03, -1.4650917259256937E+03},
-          {1.4186910680718345E+03,  -1.3995339862725591E+03, -1.3995339862725598E+03, 1.4186910680718347E+03},
-          {5.1133995502497419E+02,  -1.4191608683682996E+03, 1.4191608683682998E+03,  -5.1133995502497424E+02},
-          {-4.8293622641174039E+01, 3.9393732546135226E+01,  3.9393732546135816E+01,  -4.8293622641174061E+01},
-          {-7.8386867802392288E+01, 1.4918904800408930E+02,  -1.4918904800408751E+02, 7.8386867802392359E+01},
-          {-1.0039212571700894E+01, 5.0626747735616746E+00,  5.0626747735625512E+00,  -1.0039212571700640E+01}
-      }};
-    } else if constexpr (w == 5) {
-      return std::array<std::array<T, w>, nc> {{
-          {9.9223677575398392E+02,  3.7794697666613320E+04,  9.8715771010760494E+04,  3.7794697666613283E+04,  9.9223677575398403E+02},
-          {3.0430174925083825E+03,  3.7938404259811403E+04,  -1.1842989705877139E-11, -3.7938404259811381E+04, -3.0430174925083829E+03},
-          {3.6092689177271222E+03,  7.7501368899498666E+03,  -2.2704627332475000E+04, 7.7501368899498730E+03,  3.6092689177271218E+03},
-          {1.9990077310495396E+03,  -3.8875294641277296E+03, 9.7116927320010791E-12,  3.8875294641277369E+03,  -1.9990077310495412E+03},
-          {4.0071733590403869E+02,  -1.5861137916762602E+03, 2.3839858699098645E+03,  -1.5861137916762643E+03, 4.0071733590403909E+02},
-          {-9.1301168206167262E+01, 1.2316471075214675E+02,  2.0698495299948402E-11,  -1.2316471075214508E+02, 9.1301168206167233E+01},
-          {-5.5339722671223846E+01, 1.1960590540261879E+02,  -1.5249941358311668E+02, 1.1960590540262307E+02,  -5.5339722671223605E+01},
-          {-3.3762488150353924E+00, 2.2839981872948751E+00,  7.1884725699454154E-12,  -2.2839981872943818E+00, 3.3762488150341459E+00}
-      }};
-    } else if constexpr (w == 6) {
-      return std::array<std::array<T, w>, nc> {{
-          {2.0553833234911876E+03,  1.5499537739913128E+05,  8.1177907023291115E+05,  8.1177907023291173E+05,  1.5499537739913136E+05,  2.0553833235005691E+03},
-          {7.1269776034442639E+03,  2.0581923258843314E+05,  3.1559612614917674E+05,  -3.1559612614917627E+05, -2.0581923258843317E+05, -7.1269776034341394E+03},
-          {1.0023404568475091E+04,  9.0916650498360192E+04,  -1.0095927514054619E+05, -1.0095927514054628E+05, 9.0916650498360177E+04,  1.0023404568484635E+04},
-          {7.2536109410387417E+03,  4.8347162752602981E+03,  -5.0512736602018522E+04, 5.0512736602018478E+04,  -4.8347162752603008E+03, -7.2536109410297540E+03},
-          {2.7021878300949752E+03,  -7.8773465553972646E+03, 5.2105876478342780E+03,  5.2105876478343343E+03,  -7.8773465553972710E+03, 2.7021878301048723E+03},
-          {3.2120291706547636E+02,  -1.8229189469936762E+03, 3.7928113414429808E+03,  -3.7928113414427025E+03, 1.8229189469937312E+03,  -3.2120291705638243E+02},
-          {-1.2051267090537374E+02, 2.2400507411399673E+02,  -1.2506575852541796E+02, -1.2506575852521925E+02, 2.2400507411398695E+02,  -1.2051267089640181E+02},
-          {-4.5977202613350237E+01, 1.1536880606853076E+02,  -1.7819720186493959E+02, 1.7819720186497622E+02,  -1.1536880606854736E+02, 4.5977202622148909E+01},
-          {-1.5631081288842275E+00, 7.1037430591266115E-01,  -6.9838401121429056E-02, -6.9838401186476856E-02, 7.1037430589285400E-01,  -1.5631081203754575E+00}
-      }};
-    } else if constexpr(w==7) {
-      return std::array<std::array<T, w>, nc> {{
-          {3.9948351830487481E+03,  5.4715865608590771E+05,  5.0196413492771760E+06,  9.8206709220713247E+06,  5.0196413492771825E+06,  5.4715865608590783E+05,  3.9948351830642519E+03},
-          {1.5290160332974696E+04,  8.7628248584320408E+05,  3.4421061790934438E+06,  -2.6908159596373561E-10, -3.4421061790934461E+06, -8.7628248584320408E+05, -1.5290160332958067E+04},
-          {2.4458227486779251E+04,  5.3904618484139396E+05,  2.4315566181017534E+05,  -1.6133959371974322E+06, 2.4315566181017453E+05,  5.3904618484139396E+05,  2.4458227486795113E+04},
-          {2.1166189345881645E+04,  1.3382732160223130E+05,  -3.3113450969689694E+05, 6.9013724510092140E-10,  3.3113450969689724E+05,  -1.3382732160223136E+05, -2.1166189345866893E+04},
-          {1.0542795672344864E+04,  -7.0739172265098678E+03, -6.5563293056049893E+04, 1.2429734005960064E+05,  -6.5563293056049602E+04, -7.0739172265098332E+03, 1.0542795672361213E+04},
-          {2.7903491906228419E+03,  -1.0975382873973093E+04, 1.3656979541144799E+04,  7.7346408577822045E-10,  -1.3656979541143772E+04, 1.0975382873973256E+04,  -2.7903491906078298E+03},
-          {1.6069721418053300E+02,  -1.5518707872251393E+03, 4.3634273936642621E+03,  -5.9891976420595174E+03, 4.3634273936642730E+03,  -1.5518707872251064E+03, 1.6069721419533221E+02},
-          {-1.2289277373867256E+02, 2.8583630927743314E+02,  -2.8318194617327981E+02, 6.9043515551118249E-10,  2.8318194617392436E+02,  -2.8583630927760140E+02, 1.2289277375319763E+02},
-          {-3.2270164914249058E+01, 9.1892112257581346E+01,  -1.6710678096334209E+02, 2.0317049305432383E+02,  -1.6710678096383771E+02, 9.1892112257416159E+01,  -3.2270164900224913E+01},
-          {-1.4761409685186277E-01, -9.1862771280377487E-01, 1.2845147741777752E+00,  5.6547359492808854E-10,  -1.2845147728310689E+00, 9.1862771293147971E-01,  1.4761410890866353E-01}
-      }};
-    } else if constexpr(w==8) {
-      return std::array<std::array<T, w>, nc> {{
-          {7.3898000697447915E+03,  1.7297637497600035E+06,  2.5578341605285794E+07,  8.4789650417103335E+07,  8.4789650417103350E+07,  2.5578341605285816E+07,  1.7297637497600049E+06,  7.3898000697447915E+03},
-          {3.0719636811267599E+04,  3.1853145713323927E+06,  2.3797981861403696E+07,  2.4569731244678464E+07,  -2.4569731244678471E+07, -2.3797981861403704E+07, -3.1853145713323941E+06, -3.0719636811267606E+04},
-          {5.4488498478251728E+04,  2.4101183255475131E+06,  6.4554051283428287E+06,  -8.9200440393090546E+06, -8.9200440393090583E+06, 6.4554051283428324E+06,  2.4101183255475126E+06,  5.4488498478251728E+04},
-          {5.3926359802542116E+04,  9.0469037926849292E+05,  -6.0897036277696118E+05, -3.0743852105799988E+06, 3.0743852105800058E+06,  6.0897036277696711E+05,  -9.0469037926849339E+05, -5.3926359802542138E+04},
-          {3.2444118016247590E+04,  1.3079802224392134E+05,  -5.8652889370129269E+05, 4.2333306008151924E+05,  4.2333306008152053E+05,  -5.8652889370128722E+05, 1.3079802224392109E+05,  3.2444118016247590E+04},
-          {1.1864306345505294E+04,  -2.2700360645707988E+04, -5.0713607251414309E+04, 1.8308704458211688E+05,  -1.8308704458210632E+05, 5.0713607251413123E+04,  2.2700360645707628E+04,  -1.1864306345505294E+04},
-          {2.2812256770903232E+03,  -1.1569135767377773E+04, 2.0942387020798891E+04,  -1.1661592834945191E+04, -1.1661592834940149E+04, 2.0942387020801420E+04,  -1.1569135767377924E+04, 2.2812256770903286E+03},
-          {8.5503535636821422E+00,  -9.7513976461238224E+02, 3.8242995179171526E+03,  -6.9201295567267280E+03, 6.9201295567248662E+03,  -3.8242995179155446E+03, 9.7513976461209836E+02,  -8.5503535637013552E+00},
-          {-1.0230637348345023E+02, 2.8246898554269114E+02,  -3.8638201738139219E+02, 1.9106407993320320E+02,  1.9106407993289886E+02,  -3.8638201738492717E+02, 2.8246898554219217E+02,  -1.0230637348345138E+02},
-          {-1.9200143062947848E+01, 6.1692257626706223E+01,  -1.2981109187842989E+02, 1.8681284210471688E+02,  -1.8681284209654376E+02, 1.2981109187880142E+02,  -6.1692257626845532E+01, 1.9200143062947120E+01},
-          {3.7894993760177598E-01,  -1.7334408836731494E+00, 2.5271184057877303E+00,  -1.2600963971824484E+00, -1.2600963917834651E+00, 2.5271184069685657E+00,  -1.7334408840526812E+00, 3.7894993760636758E-01}
-      }};
-    } else if constexpr(w==9) {
-      return std::array<std::array<T, w>, nc> {{
-          {1.3136365370186100E+04,  5.0196413492771806E+06,  1.1303327711722563E+08,  5.8225443924996686E+08,  9.7700272582690656E+08,  5.8225443924996758E+08,  1.1303327711722568E+08,  5.0196413492772207E+06,  1.3136365370186135E+04},
-          {5.8623313038274340E+04,  1.0326318537280345E+07,  1.2898448324824864E+08,  3.0522863709830385E+08,  -3.9398045056223735E-08, -3.0522863709830391E+08, -1.2898448324824864E+08, -1.0326318537280388E+07, -5.8623313038274347E+04},
-          {1.1335001341875963E+05,  9.0726133144784812E+06,  5.3501544534038112E+07,  -2.6789524644146336E+05, -1.2483923718899371E+08, -2.6789524644172983E+05, 5.3501544534038112E+07,  9.0726133144785129E+06,  1.1335001341875960E+05},
-          {1.2489113703229747E+05,  4.3035547171861930E+06,  6.3021978510598792E+06,  -2.6014941986659057E+07, 6.0417403157325170E-08,  2.6014941986659389E+07,  -6.3021978510598652E+06, -4.3035547171862079E+06, -1.2489113703229751E+05},
-          {8.6425493435991244E+04,  1.0891182836653308E+06,  -2.0713033564200639E+06, -2.8994941183506218E+06, 7.5905338661205899E+06,  -2.8994941183505375E+06, -2.0713033564200667E+06, 1.0891182836653353E+06,  8.6425493435991288E+04},
-          {3.8657354724013814E+04,  7.9936390113331305E+04,  -7.0458265546791907E+05, 1.0151095605715880E+06,  1.2138090419648379E-07,  -1.0151095605717725E+06, 7.0458265546794771E+05,  -7.9936390113331567E+04, -3.8657354724013821E+04},
-          {1.0779131453134638E+04,  -3.3466718311300596E+04, -1.3245366619006139E+04, 1.8238470515353698E+05,  -2.9285656292977190E+05, 1.8238470515350526E+05,  -1.3245366619000662E+04, -3.3466718311299621E+04, 1.0779131453134616E+04},
-          {1.4992527030548456E+03,  -9.7024371533891372E+03, 2.3216330734057381E+04,  -2.3465262819040818E+04, 5.3299736484284360E-08,  2.3465262819251962E+04,  -2.3216330734049119E+04, 9.7024371533890644E+03,  -1.4992527030548747E+03},
-          {-7.9857427421129714E+01, -4.0585588534807385E+02, 2.6054813773472697E+03,  -6.1806593581075495E+03, 8.0679596874001718E+03,  -6.1806593581869265E+03, 2.6054813773147021E+03,  -4.0585588535363172E+02, -7.9857427421126204E+01},
-          {-7.1572272057937070E+01, 2.2785637019511205E+02,  -3.9109820765665262E+02, 3.3597424711470910E+02,  1.0596763818009852E-07,  -3.3597424723359080E+02, 3.9109820766854079E+02,  -2.2785637019009673E+02, 7.1572272057939983E+01},
-          {-9.8886360698074700E+00, 3.5359026949867051E+01,  -8.5251867715709949E+01, 1.4285748012617628E+02,  -1.6935269668779691E+02, 1.4285748010331625E+02,  -8.5251867711661305E+01, 3.5359026944299828E+01,  -9.8886360698207305E+00}
-      }};
-    } else if constexpr(w==10) {
-      return std::array<std::array<T, w>, nc> {{
-          {2.2594586605749264E+04,  1.3595989066786593E+07,  4.4723032442444897E+08,  3.3781755837397518E+09,  8.6836783895849819E+09,  8.6836783895849762E+09,  3.3781755837397494E+09,  4.4723032442444897E+08,  1.3595989066786474E+07,  2.2594586605749344E+04,},
-          {1.0729981697645642E+05,  3.0651490267742988E+07,  5.9387966085130465E+08,  2.4434902657508330E+09,  2.0073077861288922E+09,  -2.0073077861288943E+09, -2.4434902657508330E+09, -5.9387966085130453E+08, -3.0651490267742816E+07, -1.0729981697645638E+05,},
-          {2.2340399734184606E+05,  3.0258214643190462E+07,  3.1512411458738232E+08,  4.3618276932319808E+08,  -7.8178848450497293E+08, -7.8178848450497019E+08, 4.3618276932319826E+08,  3.1512411458738232E+08,  3.0258214643190313E+07,  2.2340399734184548E+05,},
-          {2.6917433004353486E+05,  1.6875651476661228E+07,  7.4664745481963441E+07,  -9.5882157211118385E+07, -2.0622994435532519E+08, 2.0622994435532743E+08,  9.5882157211118177E+07,  -7.4664745481963515E+07, -1.6875651476661161E+07, -2.6917433004353428E+05,},
-          {2.0818422772177903E+05,  5.6084730690362519E+06,  1.4435118192351763E+06,  -4.0063869969544649E+07, 3.2803674392747045E+07,  3.2803674392746095E+07,  -4.0063869969546899E+07, 1.4435118192351642E+06,  5.6084730690362034E+06,  2.0818422772177853E+05,},
-          {1.0781139496011091E+05,  9.9202615851199068E+05,  -3.3266265543962116E+06, -4.8557049011479173E+05, 1.0176155522772279E+07,  -1.0176155522772269E+07, 4.8557049011678610E+05,  3.3266265543963453E+06,  -9.9202615851196018E+05, -1.0781139496011072E+05,},
-          {3.7380102688153558E+04,  1.2716675000355666E+04,  -6.2163527451774501E+05, 1.4157962667184104E+06,  -8.4419693137680157E+05, -8.4419693137743860E+05, 1.4157962667189445E+06,  -6.2163527451771160E+05, 1.2716675000340010E+04,  3.7380102688153442E+04,},
-          {8.1238936393894646E+03,  -3.4872365530450072E+04, 2.3913680325196314E+04,  1.2428850301830019E+05,  -3.2158255329716846E+05, 3.2158255329951923E+05,  -1.2428850301867779E+05, -2.3913680325277423E+04, 3.4872365530457188E+04,  -8.1238936393894255E+03,},
-          {7.8515926628982663E+02,  -6.6607899119372642E+03, 2.0167398338513311E+04,  -2.8951401344519112E+04, 1.4622828142848679E+04,  1.4622828143544031E+04,  -2.8951401346900999E+04, 2.0167398338398041E+04,  -6.6607899119505255E+03, 7.8515926628967964E+02,},
-          {-1.0147176570537010E+02, -3.5304284185385157E+01, 1.3576976854876134E+03,  -4.3921059353471856E+03, 7.3232085271125388E+03,  -7.3232085273978546E+03, 4.3921059367737662E+03,  -1.3576976854043962E+03, 3.5304284185385157E+01,  1.0147176570550941E+02,},
-          {-4.3161545259389186E+01, 1.5498490981579428E+02,  -3.1771250774232175E+02, 3.7215448796427023E+02,  -1.7181762832770994E+02, -1.7181763036843782E+02, 3.7215448789408123E+02,  -3.1771250773692140E+02, 1.5498490982186786E+02,  -4.3161545259547800E+01,},
-          {-4.2916172038214198E+00, 1.7402146071148604E+01,  -4.7947588069135868E+01, 9.2697698088029625E+01,  -1.2821427596894478E+02, 1.2821427705670308E+02,  -9.2697698297776569E+01, 4.7947588093524907E+01,  -1.7402146074502035E+01, 4.2916172038452141E+00,}
-      }};
-    } else if constexpr(w==11) {
-      return std::array<std::array<T, w>, nc> {{
-          {3.7794653219809625E+04,  3.4782300224660739E+07,  1.6188020733727551E+09,  1.7196758809615005E+10,  6.3754384857724617E+10,  9.7196447559193497E+10,  6.3754384857724617E+10,  1.7196758809614998E+10,  1.6188020733727560E+09,  3.4782300224660769E+07,  3.7794653219808984E+04,},
-          {1.8969206922085886E+05,  8.4769319065313652E+07,  2.4230555767723408E+09,  1.5439732722639101E+10,  2.7112836839612309E+10,  2.5609833368650835E-06,  -2.7112836839612328E+10, -1.5439732722639105E+10, -2.4230555767723408E+09, -8.4769319065313682E+07, -1.8969206922085711E+05,},
-          {4.2138380313901440E+05,  9.2050522922791913E+07,  1.5259983101266613E+09,  4.7070559561237173E+09,  -1.2448027572952359E+09, -1.0161446790279301E+10, -1.2448027572952316E+09, 4.7070559561237268E+09,  1.5259983101266615E+09,  9.2050522922791913E+07,  4.2138380313901149E+05,},
-          {5.4814313598122005E+05,  5.8085130777589552E+07,  4.9484006166551048E+08,  1.6222124676640952E+08,  -2.0440440381345339E+09, 9.1416457449079640E-06,  2.0440440381345336E+09,  -1.6222124676640788E+08, -4.9484006166551071E+08, -5.8085130777589560E+07, -5.4814313598121714E+05,},
-          {4.6495183529254980E+05,  2.3067199578027144E+07,  6.9832590192482382E+07,  -2.2024799260683522E+08, -1.2820270942588677E+08, 5.1017181199129778E+08,  -1.2820270942588474E+08, -2.2024799260683942E+08, 6.9832590192482322E+07,  2.3067199578027155E+07,  4.6495183529254742E+05,},
-          {2.7021781043532980E+05,  5.6764510325100143E+06,  -5.5650761736748898E+06, -3.9907385617900200E+07, 7.2453390663687646E+07,  1.2300109686762266E-05,  -7.2453390663684472E+07, 3.9907385617899075E+07,  5.5650761736749066E+06,  -5.6764510325099993E+06, -2.7021781043532846E+05,},
-          {1.0933249308680627E+05,  6.9586821127987828E+05,  -3.6860240321937902E+06, 2.7428169457736355E+06,  8.3392008440593518E+06,  -1.6402201025046850E+07, 8.3392008440698013E+06,  2.7428169457778852E+06,  -3.6860240321937371E+06, 6.9586821127989423E+05,  1.0933249308680571E+05,},
-          {3.0203516161820498E+04,  -3.6879059542768438E+04, -4.1141031216788280E+05, 1.4111389975267777E+06,  -1.5914376635331670E+06, 9.4095582602103753E-06,  1.5914376635379130E+06,  -1.4111389975247320E+06, 4.1141031216776522E+05,  3.6879059542750314E+04,  -3.0203516161820549E+04,},
-          {5.1670143574922731E+03,  -2.8613147115372190E+04, 4.3560195427081359E+04,  4.8438679582765450E+04,  -2.5856630639231802E+05, 3.7994883866738499E+05,  -2.5856630640319458E+05, 4.8438679579510936E+04,  4.3560195426766244E+04,  -2.8613147115376054E+04, 5.1670143574922913E+03,},
-          {3.0888018539740131E+02,  -3.7949446187471626E+03, 1.4313303204988082E+04,  -2.6681600235594462E+04, 2.3856005166166615E+04,  8.6424601730164351E-06,  -2.3856005155895236E+04, 2.6681600234453199E+04,  -1.4313303205083188E+04, 3.7949446187583080E+03,  -3.0888018539728523E+02,},
-          {-8.3747489794189363E+01, 1.1948077479405792E+02,  4.8528498015072080E+02,  -2.5024391114755094E+03, 5.3511195318669425E+03,  -6.7655484107390166E+03, 5.3511195362291774E+03,  -2.5024391131167667E+03, 4.8528498019392708E+02,  1.1948077480620087E+02,  -8.3747489794426258E+01,},
-          {-2.2640047135517630E+01, 9.0840898563949466E+01,  -2.1597187544386938E+02, 3.1511229111443720E+02,  -2.4856617998395282E+02, 6.1683918215190516E-06,  2.4856618439352349E+02,  -3.1511228757800421E+02, 2.1597187557069353E+02,  -9.0840898570046704E+01, 2.2640047135565219E+01,},
-          {-1.6306382886201207E+00, 7.3325946591320434E+00,  -2.3241017682854558E+01, 5.1715494398901185E+01,  -8.2673000279130790E+01, 9.6489719151212370E+01,  -8.2673010381149226E+01, 5.1715494328769353E+01,  -2.3241018024860580E+01, 7.3325946448852415E+00,  -1.6306382886460551E+00,}
-      }};
-    } else if constexpr(w==12) {
-      return std::array<std::array<T, w>, nc> {{
-          {6.1722991679852908E+04,  8.4789650417103648E+07,  5.4431675199498701E+09,  7.8788892335272232E+10,  4.0355760945670044E+11,  8.8071481911347949E+11,  8.8071481911347961E+11,  4.0355760945670044E+11,  7.8788892335272430E+10,  5.4431675199498835E+09,  8.4789650417103708E+07,  6.1722991679871957E+04},
-          {3.2561466099406168E+05,  2.2112758120210618E+08,  8.9911609880089817E+09,  8.3059508064200943E+10,  2.3965569143469864E+11,  1.6939286803305212E+11,  -1.6939286803305203E+11, -2.3965569143469864E+11, -8.3059508064201080E+10, -8.9911609880089989E+09, -2.2112758120210618E+08, -3.2561466099404311E+05},
-          {7.6621098001581512E+05,  2.6026568260310286E+08,  6.4524338253008652E+09,  3.3729904113826820E+10,  2.8555202212474091E+10,  -6.8998572040731537E+10, -6.8998572040731445E+10, 2.8555202212474079E+10,  3.3729904113826824E+10,  6.4524338253008757E+09,  2.6026568260310274E+08,  7.6621098001583829E+05},
-          {1.0657807616803218E+06,  1.8144472126890984E+08,  2.5524827004349842E+09,  5.2112383911371660E+09,  -1.0268350564014645E+10, -1.4763245309081306E+10, 1.4763245309081314E+10,  1.0268350564014671E+10,  -5.2112383911371059E+09, -2.5524827004349871E+09, -1.8144472126890984E+08, -1.0657807616803099E+06},
-          {9.7829638830158755E+05,  8.2222351241519913E+07,  5.5676911894064474E+08,  -4.8739037675427330E+08, -2.7153428193078227E+09, 2.5627633609246106E+09,  2.5627633609246163E+09,  -2.7153428193078651E+09, -4.8739037675430620E+08, 5.5676911894064546E+08,  8.2222351241519868E+07,  9.7829638830161188E+05},
-          {6.2536876825114002E+05,  2.4702814073680203E+07,  4.1488431554846466E+07,  -2.9274790542418826E+08, 1.0742154109191516E+08,  6.2185168968032193E+08,  -6.2185168968012476E+08, -1.0742154109184742E+08, 2.9274790542423087E+08,  -4.1488431554843128E+07, -2.4702814073680237E+07, -6.2536876825112454E+05},
-          {2.8527714307528478E+05,  4.6266378435690766E+06,  -1.0665598090790771E+07, -2.6048960239891130E+07, 9.1597254427317813E+07,  -5.9794495983264342E+07, -5.9794495983220413E+07, 9.1597254427343085E+07,  -2.6048960239921503E+07, -1.0665598090794146E+07, 4.6266378435690673E+06,  2.8527714307530399E+05},
-          {9.2873647411234080E+04,  3.6630046787425119E+05,  -3.1271047224730137E+06, 4.8612412939252760E+06,  3.3820440907796426E+06,  -1.6880127953704204E+07, 1.6880127953756198E+07,  -3.3820440907614031E+06, -4.8612412938993908E+06, 3.1271047224752530E+06,  -3.6630046787425695E+05, -9.2873647411217215E+04},
-          {2.0817947751046438E+04,  -5.5660303410315042E+04, -1.9519783923444615E+05, 1.0804817251338551E+06,  -1.8264985852555393E+06, 9.7602844968061335E+05,  9.7602844962902542E+05,  -1.8264985852963410E+06, 1.0804817251124913E+06,  -1.9519783923503032E+05, -5.5660303410363231E+04, 2.0817947751063632E+04},
-          {2.7986023314783361E+03,  -1.9404411093655592E+04, 4.3922625000519314E+04,  -7.6450317451901383E+03, -1.5273911974273989E+05, 3.3223441458516393E+05,  -3.3223441441930021E+05, 1.5273911979752057E+05,  7.6450317512768806E+03,  -4.3922624998141677E+04, 1.9404411093637758E+04,  -2.7986023314644049E+03},
-          {6.7849020474048089E+01,  -1.7921351308204744E+03, 8.4980694686552797E+03,  -1.9742624859769410E+04, 2.4620674845030797E+04,  -1.1676544851227827E+04, -1.1676544869194569E+04, 2.4620674845030626E+04,  -1.9742624831436660E+04, 8.4980694630406069E+03,  -1.7921351308312935E+03, 6.7849020488592075E+01},
-          {-5.4577020998836872E+01, 1.3637112867242237E+02,  4.5513616580246023E+01,  -1.1174001367986359E+03, 3.2018769312434206E+03,  -5.0580351396215219E+03, 5.0580351683422405E+03,  -3.2018769242193171E+03, 1.1174000998831286E+03,  -4.5513609243969356E+01, -1.3637112867730119E+02, 5.4577021011726984E+01},
-          {-1.0538365872268786E+01, 4.6577222488645518E+01,  -1.2606964198473415E+02, 2.1881091668968099E+02,  -2.3273399614976032E+02, 1.0274275204276027E+02,  1.0274270265494516E+02,  -2.3273401859852868E+02, 2.1881091865396468E+02,  -1.2606964777237258E+02, 4.6577222453584369E+01,  -1.0538365860573146E+01},
-          {-4.6087004144309118E-01, 2.5969759128998060E+00,  -9.6946932216381381E+00, 2.4990041962121211E+01,  -4.6013909139329137E+01, 6.2056985032913090E+01,  -6.2056925855365186E+01, 4.6013921000662158E+01,  -2.4990037445376750E+01, 9.6946954085586885E+00,  -2.5969759201692755E+00, 4.6087004744129911E-01}
-      }};
-    } else if constexpr(w==13) {
-      return std::array<std::array<T, w>, nc> {{
-          {9.8715725867495363E+04,  1.9828875496808097E+08,  1.7196758809614983E+10,  3.3083776881353577E+11,  2.2668873993375439E+12,  6.7734720591167568E+12,  9.6695220682534785E+12,  6.7734720591167432E+12,  2.2668873993375430E+12,  3.3083776881353503E+11,  1.7196758809614998E+10,  1.9828875496807891E+08,  9.8715725867496090E+04},
-          {5.4491110456935549E+05,  5.4903670125539351E+08,  3.0879465445278183E+10,  3.9588436413399969E+11,  1.6860562536749778E+12,  2.4256447893117891E+12,  -5.5583944938791784E-05, -2.4256447893117847E+12, -1.6860562536749768E+12, -3.9588436413399890E+11, -3.0879465445278183E+10, -5.4903670125538898E+08, -5.4491110456935526E+05},
-          {1.3504711883426071E+06,  6.9286979077463162E+08,  2.4618123595484577E+10,  1.9493985627722607E+11,  3.9422703517046350E+11,  -1.8678883613919861E+11, -8.5538079834550110E+11, -1.8678883613919730E+11, 3.9422703517046375E+11,  1.9493985627722589E+11,  2.4618123595484566E+10,  6.9286979077462614E+08,  1.3504711883426069E+06},
-          {1.9937206140846491E+06,  5.2512029493765980E+08,  1.1253303793811750E+10,  4.6205527735932152E+10,  -1.1607472377983305E+10, -1.6305241755642313E+11, 3.5385440504350348E-04,  1.6305241755642365E+11,  1.1607472377982582E+10,  -4.6205527735932213E+10, -1.1253303793811750E+10, -5.2512029493765628E+08, -1.9937206140846489E+06},
-          {1.9607419630386413E+06,  2.6425362558103892E+08,  3.1171259341747193E+09,  2.9839860297839913E+09,  -1.9585031917561897E+10, -5.0666917387065792E+09, 3.6568794485480583E+10,  -5.0666917387057562E+09, -1.9585031917561817E+10, 2.9839860297838497E+09,  3.1171259341747184E+09,  2.6425362558103728E+08,  1.9607419630386417E+06},
-          {1.3593773865640305E+06,  9.1556445104158267E+07,  4.7074012944133747E+08,  -1.1192579335657008E+09, -2.1090780087868555E+09, 5.2270306737951984E+09,  5.6467240041521856E-04,  -5.2270306737934217E+09, 2.1090780087880819E+09,  1.1192579335658383E+09,  -4.7074012944133127E+08, -9.1556445104157984E+07, -1.3593773865640305E+06},
-          {6.8417206432039209E+05,  2.1561705510027152E+07,  7.5785249893055111E+06,  -2.7456096030221754E+08, 3.4589095671054310E+08,  4.0256106808894646E+08,  -1.0074306926603404E+09, 4.0256106809081393E+08,  3.4589095670997137E+08,  -2.7456096030236483E+08, 7.5785249893030487E+06,  2.1561705510027405E+07,  6.8417206432039209E+05},
-          {2.5248269397037517E+05,  3.0985559672616189E+06,  -1.1816517087616559E+07, -8.2958498770184973E+06, 8.0546642347355247E+07,  -1.0594657799485898E+08, 2.1816722293163801E-04,  1.0594657799424352E+08,  -8.0546642347497791E+07, 8.2958498771036500E+06,  1.1816517087615721E+07,  -3.0985559672621777E+06, -2.5248269397037517E+05},
-          {6.7530100970876694E+04,  1.2373362326658823E+05,  -2.1245597183281910E+06, 5.1047323238754412E+06,  -1.4139444405488928E+06, -1.1818267555096827E+07, 2.0121548578624789E+07,  -1.1818267557079868E+07, -1.4139444401348191E+06, 5.1047323236516044E+06,  -2.1245597183309775E+06, 1.2373362326702787E+05,  6.7530100970876316E+04},
-          {1.2421368748961073E+04,  -5.0576243647011936E+04, -4.8878193436902722E+04, 6.5307896872028301E+05,  -1.5497610127060430E+06, 1.5137725917321201E+06,  4.1615986404011299E-04,  -1.5137725918538549E+06, 1.5497610130469005E+06,  -6.5307896856811445E+05, 4.8878193438804832E+04,  5.0576243646433126E+04,  -1.2421368748961073E+04},
-          {1.2904654687550299E+03,  -1.1169946055009055E+04, 3.3275109713863385E+04,  -3.1765222274236821E+04, -5.9810982085323274E+04, 2.2355863038592847E+05,  -3.1083591705219547E+05, 2.2355863445202672E+05,  -5.9810982721084511E+04, -3.1765222464963932E+04, 3.3275109714208855E+04,  -1.1169946054555618E+04, 1.2904654687545376E+03},
-          {-1.9043622268674213E+01, -6.8296542209516542E+02, 4.2702512274202591E+03,  -1.2165497317825058E+04, 1.9423733298269544E+04,  -1.6010024066956401E+04, 3.4018642874429026E-04,  1.6010021599471667E+04,  -1.9423732817821805E+04, 1.2165497483905752E+04,  -4.2702512286689680E+03, 6.8296542153908558E+02,  1.9043622268312891E+01},
-          {-3.0093984465361217E+01, 9.8972865724808671E+01,  -9.7437038666761538E+01, -3.5079928405373198E+02, 1.5699250566648977E+03,  -3.1287439837941820E+03, 3.8692196309709061E+03,  -3.1287462825615335E+03, 1.5699252631958864E+03,  -3.5079944793112952E+02, -9.7437041893750632E+01, 9.8972866189610414E+01,  -3.0093984465884773E+01},
-          {-4.3050286009489040E+00, 2.1108975724659501E+01,  -6.4297198812570272E+01, 1.2922884632277874E+02,  -1.6991812716212596E+02, 1.2655005901719436E+02,  9.2483537895948854E-05,  -1.2655066232531748E+02, 1.6991805207569072E+02,  -1.2922893667436634E+02, 6.4297198424711908E+01,  -2.1108976207523057E+01, 4.3050286009485790E+00},
-          {-1.0957333716725008E-01, 7.2949317004436565E-01,  -3.4300816058693728E+00, 1.0470054474579324E+01,  -2.2292134950656113E+01, 3.4570827323582719E+01,  -3.9923523442753932E+01, 3.4573264959502886E+01,  -2.2292358612963266E+01, 1.0470042004916014E+01,  -3.4300810538570281E+00, 7.2949352113279253E-01,  -1.0957333740315604E-01}
-      }};
-    } else if constexpr(w==14) {
-      return std::array<std::array<T, w>, nc> {{
-          {1.5499533202966207E+05,  4.4723032442444688E+08,  5.1495083701694740E+10,  1.2904576022918071E+12,  1.1534950432785506E+13,  4.5650102198520484E+13,  8.8830582190032641E+13,  8.8830582190032641E+13,  4.5650102198520492E+13,  1.1534950432785527E+13,  1.2904576022918074E+12,  5.1495083701695107E+10,  4.4723032442444855E+08,  1.5499533202970232E+05},
-          {8.9188339002980455E+05,  1.3065352538728635E+09,  9.9400185225815567E+10,  1.7136059013402405E+12,  1.0144146621675832E+13,  2.3034036018490715E+13,  1.4630967270448871E+13,  -1.4630967270448855E+13, -2.3034036018490719E+13, -1.0144146621675846E+13, -1.7136059013402405E+12, -9.9400185225815964E+10, -1.3065352538728662E+09, -8.9188339002979454E+05},
-          {2.3170473769379663E+06,  1.7532505043698256E+09,  8.6523535958354309E+10,  9.7455289065487354E+11,  3.2977972139362314E+12,  1.7874626001697781E+12,  -6.1480918082633916E+12, -6.1480918082633975E+12, 1.7874626001697690E+12,  3.2977972139362285E+12,  9.7455289065487329E+11,  8.6523535958354630E+10,  1.7532505043698275E+09,  2.3170473769380399E+06},
-          {3.6089249230396422E+06,  1.4278058213962190E+09,  4.4296625537022423E+10,  2.9466624630419781E+11,  3.1903621584503235E+11,  -9.8834691411254565E+11, -1.1072264714919226E+12, 1.1072264714919316E+12,  9.8834691411255151E+11,  -3.1903621584503467E+11, -2.9466624630419769E+11, -4.4296625537022621E+10, -1.4278058213962219E+09, -3.6089249230396664E+06},
-          {3.7733555140851745E+06,  7.8376718099107409E+08,  1.4443117772349569E+10,  4.3197433307418671E+10,  -7.6585042240585556E+10, -1.8569640140763062E+11, 2.0385335192657199E+11,  2.0385335192656519E+11,  -1.8569640140762662E+11, -7.6585042240580856E+10, 4.3197433307418686E+10,  1.4443117772349669E+10,  7.8376718099107552E+08,  3.7733555140852560E+06},
-          {2.8079157920112358E+06,  3.0340753492383724E+08,  2.9498136661747241E+09,  -6.2820200387919831E+08, -2.2372008390623215E+10, 1.5217518660584890E+10,  4.0682590266891922E+10,  -4.0682590266869431E+10, -1.5217518660582748E+10, 2.2372008390625935E+10,  6.2820200387968791E+08,  -2.9498136661747637E+09, -3.0340753492383808E+08, -2.8079157920112377E+06},
-          {1.5361613559533111E+06,  8.3513615594416574E+07,  3.0077547202708024E+08,  -1.3749596754067802E+09, -6.6733027297557127E+08, 5.9590333632819109E+09,  -4.3025685566870070E+09, -4.3025685566872711E+09, 5.9590333632806673E+09,  -6.6733027297523963E+08, -1.3749596754067125E+09, 3.0077547202709383E+08,  8.3513615594416171E+07,  1.5361613559533576E+06},
-          {6.2759409419592959E+05,  1.5741723594963098E+07,  -1.5632610223406436E+07, -1.9294824907078514E+08, 4.4643806532434595E+08,  1.5178998385244830E+07,  -9.6771139891725647E+08, 9.6771139892509627E+08,  -1.5178998381042883E+07, -4.4643806533176166E+08, 1.9294824907065383E+08,  1.5632610223392555E+07,  -1.5741723594963137E+07, -6.2759409419590747E+05},
-          {1.9151404903933613E+05,  1.7156606891563335E+06,  -9.7733523156688716E+06, 4.2982266233154163E+06,  5.1660907884347722E+07,  -1.1279400211155911E+08, 6.4701089573962681E+07,  6.4701089571562663E+07,  -1.1279400211012064E+08, 5.1660907891220264E+07,  4.2982266233826512E+06,  -9.7733523157112263E+06, 1.7156606891560503E+06,  1.9151404903936724E+05},
-          {4.2715272622845026E+04,  -2.2565910611953568E+03, -1.1769776156959014E+06, 4.0078399907813077E+06,  -3.8951858063335596E+06, -5.0944610754510267E+06, 1.6765992446914168E+07,  -1.6765992426657490E+07, 5.0944610781778870E+06,  3.8951858062361716E+06,  -4.0078399907326135E+06, 1.1769776157141617E+06,  2.2565910606306688E+03,  -4.2715272622820135E+04},
-          {6.4806786522793900E+03,  -3.5474227032974472E+04, 1.8237100709385861E+04,  3.0934714629696816E+05,  -1.0394703931686131E+06, 1.4743920333143482E+06,  -7.3356882447856572E+05, -7.3356882916658197E+05, 1.4743920305501707E+06,  -1.0394703929917105E+06, 3.0934714631908614E+05,  1.8237100665157792E+04,  -3.5474227033406372E+04, 6.4806786523010323E+03},
-          {4.9913632908459954E+02,  -5.5416668524952684E+03, 2.0614058717617296E+04,  -3.2285139072943130E+04, -5.3099550821623425E+03, 1.1559000502166932E+05,  -2.2569743259261423E+05, 2.2569743616896842E+05,  -1.1559000130545651E+05, 5.3099543129458480E+03,  3.2285139142872020E+04,  -2.0614058670790018E+04, 5.5416668533342381E+03,  -4.9913632906195977E+02},
-          {-3.3076333188134086E+01, -1.8970588563697331E+02, 1.8160423493164808E+03,  -6.3715703355644328E+03, 1.2525624574329036E+04,  -1.4199806452802783E+04, 6.4441892296909591E+03,  6.4441909537524216E+03,  -1.4199808176873401E+04, 1.2525626154733827E+04,  -6.3715704433222418E+03, 1.8160422729911850E+03,  -1.8970588700495102E+02, -3.3076333168231550E+01},
-          {-1.4394533627743886E+01, 5.7000699089242815E+01,  -1.0101142663923416E+02, -3.2954197414395189E+01, 6.1417879182394654E+02,  -1.6177283846697430E+03, 2.4593386157454975E+03,  -2.4593322941165261E+03, 1.6177291239900730E+03,  -6.1417952013923764E+02, 3.2954100943010943E+01,  1.0101142710333265E+02,  -5.7000699100179844E+01, 1.4394533639240331E+01},
-          {-1.5925952284027161E+00, 8.5113930215357829E+00,  -2.8993523187012922E+01, 6.6373454994590404E+01,  -1.0329574518449559E+02, 1.0280184257681817E+02,  -4.3896094875192006E+01, -4.3899302208087086E+01, 1.0280039795628096E+02,  -1.0329511291885207E+02, 6.6373435700858948E+01,  -2.8993536490606409E+01, 8.5113924808491728E+00,  -1.5925952194145006E+00},
-          {1.5984868520881029E-02,  1.2876175212962959E-01,  -9.8358742969175483E-01, 3.7711523389360830E+00,  -9.4305498095765508E+00, 1.6842854581416674E+01,  -2.2308566502972713E+01, 2.2308940200151390E+01,  -1.6841512668820517E+01, 9.4313524091989347E+00,  -3.7710716543179599E+00, 9.8361025494556609E-01,  -1.2876100566420701E-01, -1.5984859433053292E-02}
-      }};
-    } else if constexpr(w==15) {
-      return std::array<std::array<T, w>, nc> {{
-          {2.3939707792241839E+05,  9.7700272582690191E+08,  1.4715933396485257E+11,  4.7242424833337158E+12,  5.3987426629953594E+13,  2.7580474290566078E+14,  7.0693378336533400E+14,  9.6196578554477775E+14,  7.0693378336533400E+14,  2.7580474290566125E+14,  5.3987426629953766E+13,  4.7242424833337246E+12,  1.4715933396485263E+11,  9.7700272582690215E+08,  2.3939707792242285E+05},
-          {1.4314487885226035E+06,  2.9961416925358453E+09,  3.0273361232748438E+11,  6.8507333793903584E+12,  5.4192702756911000E+13,  1.7551587948105309E+14,  2.1874615668430150E+14,  3.4316191014053393E-02,  -2.1874615668430150E+14, -1.7551587948105334E+14, -5.4192702756911180E+13, -6.8507333793903701E+12, -3.0273361232748438E+11, -2.9961416925358458E+09, -1.4314487885226049E+06},
-          {3.8829497354762917E+06,  4.2473082696966448E+09,  2.8414312556015540E+11,  4.3688281331121411E+12,  2.1823119508000543E+13,  3.2228098609392094E+13,  -2.1833085454691789E+13, -7.3750710225100812E+13, -2.1833085454691820E+13, 3.2228098609392055E+13,  2.1823119508000594E+13,  4.3688281331121479E+12,  2.8414312556015527E+11,  4.2473082696966434E+09,  3.8829497354762889E+06},
-          {6.3495763451755755E+06,  3.6841035003733950E+09,  1.5965774278321045E+11,  1.5630338683778201E+12,  3.8749058615819268E+12,  -2.7319740087723574E+12, -1.3233342822865402E+13, 6.1642230420317079E-02,  1.3233342822865449E+13,  2.7319740087723975E+12,  -3.8749058615819365E+12, -1.5630338683778203E+12, -1.5965774278321042E+11, -3.6841035003733935E+09, -6.3495763451755764E+06},
-          {7.0146619045520434E+06,  2.1782897863065763E+09,  5.8897780310148087E+10,  3.1953009601770325E+11,  4.0651527029737198E+08,  -1.6379148273276064E+12, -1.1568753137013029E+11, 2.7451653250460508E+12,  -1.1568753137012485E+11, -1.6379148273277261E+12, 4.0651527029819238E+08,  3.1953009601770361E+11,  5.8897780310148087E+10,  2.1782897863065763E+09,  7.0146619045520443E+06},
-          {5.5580012413990172E+06,  9.2345162185944164E+08,  1.4522950934020109E+10,  2.7025952371212009E+10,  -1.2304576967641914E+11, -1.0116752717202786E+11, 3.8517418245458325E+11,  1.0918347404432817E-01,  -3.8517418245444312E+11, 1.0116752717221135E+11,  1.2304576967643665E+11,  -2.7025952371214943E+10, -1.4522950934020079E+10, -9.2345162185944211E+08, -5.5580012413990181E+06},
-          {3.2693972344231778E+06,  2.8610260147425205E+08,  2.2348528403750563E+09,  -3.4574515574242272E+09, -1.7480626463583939E+10, 3.1608597465540653E+10,  1.9879262560072273E+10,  -6.6148013553772224E+10, 1.9879262560085339E+10,  3.1608597465515747E+10,  -1.7480626463576942E+10, -3.4574515574198236E+09, 2.2348528403750110E+09,  2.8610260147425193E+08,  3.2693972344231787E+06},
-          {1.4553539959296256E+06,  6.4136842048384041E+07,  1.3622336582062906E+08,  -1.2131510424644001E+09, 6.4322366984221375E+08,  4.5078753872047586E+09,  -7.1689413746930647E+09, 3.2906916833662987E-02,  7.1689413746724453E+09,  -4.5078753875009747E+09, -6.4322366985365331E+08, 1.2131510424608817E+09,  -1.3622336582067037E+08, -6.4136842048384242E+07, -1.4553539959296256E+06},
-          {4.9358776531681651E+05,  9.7772970960585065E+06,  -2.3511574237987626E+07, -1.0142613816641946E+08, 3.9421144218035364E+08,  -2.8449115593052310E+08, -5.7549243243741119E+08, 1.1608781631182449E+09,  -5.7549243240763104E+08, -2.8449115600447333E+08, 3.9421144214381480E+08,  -1.0142613816429654E+08, -2.3511574237995699E+07, 9.7772970960588697E+06,  4.9358776531681546E+05},
-          {1.2660319987326677E+05,  7.7519511328119377E+05,  -6.5244610661450895E+06, 9.0878257488052379E+06,  2.3116605621149920E+07,  -8.7079594462079599E+07, 9.5542733739275128E+07,  6.0548970733798724E-02,  -9.5542733661364838E+07, 8.7079594608550951E+07,  -2.3116605559600785E+07, -9.0878257522138134E+06, 6.5244610661298726E+06,  -7.7519511328133650E+05, -1.2660319987326639E+05},
-          {2.3793325531458529E+04,  -4.2305332803808597E+04, -5.2884156985535356E+05, 2.5307340127864038E+06,  -4.0404175271559842E+06, -1.7519992360184138E+05, 1.0146438805818636E+07,  -1.5828545480742473E+07, 1.0146438778928882E+07,  -1.7520004389869148E+05, -4.0404175770437294E+06, 2.5307340149977510E+06,  -5.2884156989405944E+05, -4.2305332803937294E+04, 2.3793325531459184E+04},
-          {2.9741655196834722E+03,  -2.0687056403786246E+04, 3.3295507799709936E+04,  1.0661145730323243E+05,  -5.6644238105382060E+05, 1.0874811616841732E+06,  -9.6561270266008016E+05, 1.5626594062671070E-02,  9.6561272951271443E+05,  -1.0874812528712249E+06, 5.6644243308078672E+05,  -1.0661145838213131E+05, -3.3295507812197495E+04, 2.0687056403630129E+04,  -2.9741655196846405E+03},
-          {1.5389176594899303E+02,  -2.3864418511494741E+03, 1.0846266954249364E+04,  -2.2940053396478714E+04, 1.4780106121058996E+04,  4.2663651769852157E+04,  -1.3047648013242516E+05, 1.7468401314164279E+05,  -1.3047645484607235E+05, 4.2663541429144650E+04,  1.4780036296018619E+04,  -2.2940053180976502E+04, 1.0846266927315819E+04,  -2.3864418517113058E+03, 1.5389176594779781E+02},
-          {-2.3857631312588978E+01, -1.9651606133609231E+01, 6.4183083829803820E+02,  -2.8648433109641578E+03, 6.8249243722518859E+03,  -9.7944325124827701E+03, 7.6177757600121276E+03,  1.8034307737205296E-02,  -7.6177559127722052E+03, 9.7944326623113047E+03,  -6.8249058342322496E+03, 2.8648407117981119E+03,  -6.4183085438795774E+02, 1.9651605969778377E+01,  2.3857631312809222E+01},
-          {-6.1348505739169541E+00, 2.7872915855267404E+01,  -6.5819942538871970E+01, 5.1366231962952028E+01,  1.7213955398158618E+02,  -6.9658621010000411E+02, 1.3192236112353403E+03,  -1.6054106225233884E+03, 1.3192031991952242E+03,  -6.9663961216547739E+02, 1.7211403815802629E+02,  5.1367579954366171E+01,  -6.5819957939661379E+01, 2.7872915947616441E+01,  -6.1348505735855374E+00},
-          {-4.9671584513490097E-01, 3.0617550953446115E+00,  -1.1650665638578070E+01, 3.0081586723089057E+01,  -5.4028356726202020E+01, 6.6077203078498044E+01,  -4.7145500171928198E+01, 4.2118837140985958E-03,  4.7167106663349848E+01,  -6.6048394423269173E+01, 5.4062906728994193E+01,  -3.0081603709324451E+01, 1.1650672008416343E+01,  -3.0617551285208524E+00, 4.9671584437353217E-01},
-          {4.3460786767313729E-03,  -1.3199600771767199E-02, -1.9412688562910244E-01, 1.1329433700669471E+00,  -3.4442045795063887E+00, 7.1737626956468912E+00,  -1.1098109271625262E+01, 1.2385772358881393E+01,  -1.1101471316239516E+01, 7.0913926025978853E+00,  -3.4845491148773502E+00, 1.1323523856621058E+00,  -1.9414904754428672E-01, -1.3200165079792004E-02, 4.3460782759443158E-03}
-      }};
-    } else if constexpr(w==16) {
-      return std::array<std::array<T, w>, nc> {{
-          {3.6434551345570839E+05,  2.0744705928579483E+09,  4.0355760945669995E+11,  1.6364575388763029E+13,  2.3514830376056538E+14,  1.5192201717462528E+15,  4.9956173084674090E+15,  8.9287666945127360E+15,  8.9287666945127390E+15,  4.9956173084674090E+15,  1.5192201717462528E+15,  2.3514830376056538E+14,  1.6364575388763035E+13,  4.0355760945670026E+11,  2.0744705928579524E+09,  3.6434551345571183E+05},
-          {2.2576246485480359E+06,  6.6499571180086451E+09, 8.7873753526056287E+11, 2.5606844387131066E+13, 2.6313738449330153E+14, 1.1495095100701460E+15, 2.1932582707747560E+15, 1.2860244365132595E+15, -1.2860244365132600E+15, -2.1932582707747578E+15,         -1.1495095100701465E+15, -2.6313738449330159E+14, -2.5606844387131062E+13, -8.7873753526056299E+11, -6.6499571180086451E+09, -2.2576246485480373E+06},
-          {6.3730995546265077E+06,  9.9060026035198078E+09,  8.8097248605449023E+11,  1.7953384130753688E+13,  1.2398425545001662E+14,  3.0749346493041262E+14,  1.0259777520247159E+14,  -5.5291976457534325E+14, -5.5291976457534325E+14, 1.0259777520247186E+14,  3.0749346493041219E+14,  1.2398425545001659E+14,  1.7953384130753676E+13,  8.8097248605448950E+11,  9.9060026035198040E+09,  6.3730995546265030E+06},
-          {1.0896915393078227E+07,  9.0890343524593849E+09,  5.3565169504010010E+11,  7.3004206720038701E+12,  2.9692333044160066E+13,  1.6051737468109549E+13,  -9.1273329108089906E+13, -8.5999306918502953E+13, 8.5999306918502422E+13,  9.1273329108089984E+13,  -1.6051737468109510E+13, -2.9692333044160082E+13, -7.3004206720038701E+12, -5.3565169504010022E+11, -9.0890343524593849E+09, -1.0896915393078227E+07},
-          {1.2655725616100594E+07,  5.7342804054544210E+09,  2.1822836608899570E+11,  1.8300700858999690E+12,  2.7770431049857676E+12,  -8.5034969223852568E+12, -1.2846668467423438E+13, 1.6519076896571838E+13,  1.6519076896572182E+13,  -1.2846668467423555E+13, -8.5034969223850703E+12, 2.7770431049857896E+12,  1.8300700858999678E+12,  2.1822836608899567E+11,  5.7342804054544210E+09,  1.2655725616100591E+07},
-          {1.0609303958036326E+07,  2.6255609052371716E+09,  6.1673589426039413E+10,  2.6044432099085333E+11,  -3.5431628074578204E+11, -1.6077602129636348E+12, 1.5534405614728977E+12,  2.8019935380857432E+12,  -2.8019935380841978E+12, -1.5534405614724106E+12, 1.6077602129635625E+12,  3.5431628074580896E+11,  -2.6044432099084848E+11, -6.1673589426039429E+10, -2.6255609052371716E+09, -1.0609303958036322E+07},
-          {6.6544809363384582E+06,  8.9490403680928326E+08,  1.1882638725190845E+10,  8.1552898137823076E+09,  -1.2575562817886868E+11, 2.7074695075907585E+10,  3.9453789461955023E+11,  -3.1679644857468066E+11, -3.1679644857392346E+11, 3.9453789461966650E+11,  2.7074695075992649E+10,  -1.2575562817884555E+11, 8.1552898137788668E+09,  1.1882638725190889E+10,  8.9490403680928278E+08,  6.6544809363384554E+06},
-          {3.1906872142825006E+06,  2.2785946180651775E+08,  1.3744578972809248E+09,  -4.3997172592883167E+09, -9.2011130754043922E+09, 3.4690551711832901E+10,  -9.4227043395047741E+09, -5.9308465070198639E+10, 5.9308465069336540E+10,  9.4227043396350136E+09,  -3.4690551711738396E+10, 9.2011130753567543E+09,  4.3997172592879610E+09,  -1.3744578972813025E+09, -2.2785946180651844E+08, -3.1906872142825015E+06},
-          {1.1821527096621769E+06,  4.2281234059839502E+07,  2.8723226058712766E+07,  -8.3553955857628822E+08, 1.2447304828823066E+09,  2.1955280943585949E+09,  -7.0514195726908512E+09, 4.3745141239718714E+09,  4.3745141233600502E+09,  -7.0514195728029747E+09, 2.1955280943510208E+09,  1.2447304828590808E+09,  -8.3553955857879233E+08, 2.8723226058761366E+07,  4.2281234059838109E+07,  1.1821527096621762E+06},
-          {3.3854610744280310E+05,  5.2176984975081543E+06,  -2.0677283565079328E+07, -3.5831818968518838E+07, 2.6599346106412742E+08,  -3.7992777977357000E+08, -1.3426914417466179E+08, 9.1752051229224503E+08,  -9.1752051129499328E+08, 1.3426914497246322E+08,  3.7992777991069216E+08,  -2.6599346104854536E+08, 3.5831818968908392E+07,  2.0677283564896725E+07,  -5.2176984975075833E+06, -3.3854610744279937E+05},
-          {7.3893334077310064E+04,  2.6983804209559254E+05,  -3.6415998561101072E+06, 8.4025485849181097E+06,  4.9278860779345948E+06,  -5.1437033846752726E+07, 8.7603898676325440E+07,  -4.6199498412402093E+07, -4.6199498208604209E+07, 8.7603898435731798E+07,  -5.1437033863736227E+07, 4.9278861005789889E+06,  8.4025485831489991E+06,  -3.6415998560990733E+06, 2.6983804209473461E+05,  7.3893334077307401E+04},
-          {1.1778892113375481E+04,  -4.0077190108724200E+04, -1.8372552175909068E+05, 1.3262878399160223E+06,  -2.9738539927520575E+06, 1.9493509709529271E+06,  4.1881949951139782E+06,  -1.1066749616505133E+07, 1.1066749327519676E+07,  -4.1881946843906553E+06, -1.9493507810665092E+06, 2.9738539818831389E+06,  -1.3262878384774840E+06, 1.8372552162922107E+05,  4.0077190107319519E+04,  -1.1778892113376129E+04},
-          {1.2019749667923656E+03,  -1.0378455844500613E+04, 2.6333352653155256E+04,  1.7117060106301305E+04,  -2.5133287443653666E+05, 6.4713914262131555E+05,  -8.1634942572553246E+05, 3.8623935281825601E+05,  3.8623876433339820E+05,  -8.1634960962672008E+05, 6.4713900469564367E+05,  -2.5133289627502396E+05, 1.7117057951236206E+04,  2.6333352581335013E+04,  -1.0378455846609291E+04, 1.2019749667911419E+03},
-          {3.1189837632471693E+01,  -8.9083493807061564E+02, 4.9454293649337906E+03,  -1.3124693635095375E+04, 1.5834784331991095E+04,  6.9607870364081436E+03,  -5.9789871879430451E+04, 1.0841726514394575E+05,  -1.0841709685990328E+05, 5.9790206615067997E+04,  -6.9607049368128291E+03, -1.5834783935893831E+04, 1.3124692974990443E+04,  -4.9454295091588992E+03, 8.9083493794871868E+02,  -3.1189837631106176E+01},
-          {-1.2975319073401824E+01, 1.8283698218710011E+01,  1.7684015393859755E+02,  -1.1059917445033070E+03, 3.1998168298121523E+03,  -5.5988200120063057E+03, 5.9248751921324047E+03,  -2.5990022806343668E+03, -2.5990962125709430E+03, 5.9247537039895724E+03,  -5.5988835070734467E+03, 3.1998292349030621E+03,  -1.1059926481090836E+03, 1.7684013881079576E+02,  1.8283698123134819E+01,  -1.2975319073977776E+01},
-          {-2.3155118729954247E+00, 1.1938503634469159E+01,  -3.4150562973753665E+01, 4.8898615554511437E+01,  1.5853185548633874E+01,  -2.4272678107130790E+02, 6.0151276286907887E+02,  -8.8751856926690448E+02, 8.8742942550355474E+02,  -6.0136491467620624E+02, 2.4282489356694586E+02,  -1.5850195971204462E+01, -4.8897392545563044E+01, 3.4150562973753665E+01,  -1.1938504430698943E+01, 2.3155118723150525E+00},
-          {-1.5401723686076832E-01, 9.8067823888634464E-01,  -4.1900843552415639E+00, 1.2150534299778382E+01,  -2.4763139606227178E+01, 3.6068014621628578E+01,  -3.4346647779134791E+01, 1.3259903958585387E+01,  1.2937147675617604E+01,  -3.4454233206790519E+01, 3.6027670086257579E+01,  -2.4769863695455662E+01, 1.2149431128889342E+01,  -4.1901615115388706E+00, 9.8067695636810759E-01,  -1.5401723756214594E-01},
-          {1.1808835093099178E-02,  -2.5444299558662394E-02, -1.5661344238792723E-04, 2.5820071204205225E-01,  -1.0930950485268096E+00, 2.6408492552008669E+00,  -4.4415763059111955E+00, 6.8227366238712817E+00,  -6.8186662643534008E+00, 4.4887924763186051E+00,  -2.6327085361651021E+00, 1.0918739406714428E+00,  -2.5844238963842503E-01, 1.2680123888735934E-04,  2.5444206395526567E-02,  -1.1808834826225629E-02}
-      }};
-    } else {
-      static_assert(w >= 2, "w must be >= 2");
-      static_assert(w <= 16, "w must be <= 16");
-      return {};
-    }
+  if constexpr (w == 2) {
+    return std::array<std::array<T, w>, nc>{
+        {{4.5147043243215315E+01, 4.5147043243215300E+01},
+         {5.7408070938221300E+01, -5.7408070938221293E+01},
+         {-1.8395117920046484E+00, -1.8395117920046560E+00},
+         {-2.0382426253182082E+01, 2.0382426253182086E+01},
+         {-2.0940804433577420E+00, -2.0940804433577389E+00}}};
+  } else if constexpr (w == 3) {
+    return std::array<std::array<T, w>, nc>{
+        {{1.5653991189315119E+02, 8.8006872410780295E+02, 1.5653991189967152E+02},
+         {3.1653018869611077E+02, 7.4325702843759617E-14, -3.1653018868907071E+02},
+         {1.7742692790454484E+02, -3.3149255274727801E+02, 1.7742692791117119E+02},
+         {-1.5357716116473156E+01, 9.5071486252033243E-15, 1.5357716122720193E+01},
+         {-3.7757583061523668E+01, 5.3222970968867315E+01, -3.7757583054647384E+01},
+         {-3.9654011076088804E+00, 1.8062124448285358E-13, 3.9654011139270540E+00}}};
+  } else if constexpr (w == 4) {
+    return std::array<std::array<T, w>, nc>{
+        {{5.4284366850213200E+02, 1.0073871433088398E+04, 1.0073871433088396E+04,
+          5.4284366850213223E+02},
+         {1.4650917259256939E+03, 6.1905285583602863E+03, -6.1905285583602881E+03,
+          -1.4650917259256937E+03},
+         {1.4186910680718345E+03, -1.3995339862725591E+03, -1.3995339862725598E+03,
+          1.4186910680718347E+03},
+         {5.1133995502497419E+02, -1.4191608683682996E+03, 1.4191608683682998E+03,
+          -5.1133995502497424E+02},
+         {-4.8293622641174039E+01, 3.9393732546135226E+01, 3.9393732546135816E+01,
+          -4.8293622641174061E+01},
+         {-7.8386867802392288E+01, 1.4918904800408930E+02, -1.4918904800408751E+02,
+          7.8386867802392359E+01},
+         {-1.0039212571700894E+01, 5.0626747735616746E+00, 5.0626747735625512E+00,
+          -1.0039212571700640E+01}}};
+  } else if constexpr (w == 5) {
+    return std::array<std::array<T, w>, nc>{
+        {{9.9223677575398392E+02, 3.7794697666613320E+04, 9.8715771010760494E+04,
+          3.7794697666613283E+04, 9.9223677575398403E+02},
+         {3.0430174925083825E+03, 3.7938404259811403E+04, -1.1842989705877139E-11,
+          -3.7938404259811381E+04, -3.0430174925083829E+03},
+         {3.6092689177271222E+03, 7.7501368899498666E+03, -2.2704627332475000E+04,
+          7.7501368899498730E+03, 3.6092689177271218E+03},
+         {1.9990077310495396E+03, -3.8875294641277296E+03, 9.7116927320010791E-12,
+          3.8875294641277369E+03, -1.9990077310495412E+03},
+         {4.0071733590403869E+02, -1.5861137916762602E+03, 2.3839858699098645E+03,
+          -1.5861137916762643E+03, 4.0071733590403909E+02},
+         {-9.1301168206167262E+01, 1.2316471075214675E+02, 2.0698495299948402E-11,
+          -1.2316471075214508E+02, 9.1301168206167233E+01},
+         {-5.5339722671223846E+01, 1.1960590540261879E+02, -1.5249941358311668E+02,
+          1.1960590540262307E+02, -5.5339722671223605E+01},
+         {-3.3762488150353924E+00, 2.2839981872948751E+00, 7.1884725699454154E-12,
+          -2.2839981872943818E+00, 3.3762488150341459E+00}}};
+  } else if constexpr (w == 6) {
+    return std::array<std::array<T, w>, nc>{
+        {{2.0553833234911876E+03, 1.5499537739913128E+05, 8.1177907023291115E+05,
+          8.1177907023291173E+05, 1.5499537739913136E+05, 2.0553833235005691E+03},
+         {7.1269776034442639E+03, 2.0581923258843314E+05, 3.1559612614917674E+05,
+          -3.1559612614917627E+05, -2.0581923258843317E+05, -7.1269776034341394E+03},
+         {1.0023404568475091E+04, 9.0916650498360192E+04, -1.0095927514054619E+05,
+          -1.0095927514054628E+05, 9.0916650498360177E+04, 1.0023404568484635E+04},
+         {7.2536109410387417E+03, 4.8347162752602981E+03, -5.0512736602018522E+04,
+          5.0512736602018478E+04, -4.8347162752603008E+03, -7.2536109410297540E+03},
+         {2.7021878300949752E+03, -7.8773465553972646E+03, 5.2105876478342780E+03,
+          5.2105876478343343E+03, -7.8773465553972710E+03, 2.7021878301048723E+03},
+         {3.2120291706547636E+02, -1.8229189469936762E+03, 3.7928113414429808E+03,
+          -3.7928113414427025E+03, 1.8229189469937312E+03, -3.2120291705638243E+02},
+         {-1.2051267090537374E+02, 2.2400507411399673E+02, -1.2506575852541796E+02,
+          -1.2506575852521925E+02, 2.2400507411398695E+02, -1.2051267089640181E+02},
+         {-4.5977202613350237E+01, 1.1536880606853076E+02, -1.7819720186493959E+02,
+          1.7819720186497622E+02, -1.1536880606854736E+02, 4.5977202622148909E+01},
+         {-1.5631081288842275E+00, 7.1037430591266115E-01, -6.9838401121429056E-02,
+          -6.9838401186476856E-02, 7.1037430589285400E-01, -1.5631081203754575E+00}}};
+  } else if constexpr (w == 7) {
+    return std::array<std::array<T, w>, nc>{
+        {{3.9948351830487481E+03, 5.4715865608590771E+05, 5.0196413492771760E+06,
+          9.8206709220713247E+06, 5.0196413492771825E+06, 5.4715865608590783E+05,
+          3.9948351830642519E+03},
+         {1.5290160332974696E+04, 8.7628248584320408E+05, 3.4421061790934438E+06,
+          -2.6908159596373561E-10, -3.4421061790934461E+06, -8.7628248584320408E+05,
+          -1.5290160332958067E+04},
+         {2.4458227486779251E+04, 5.3904618484139396E+05, 2.4315566181017534E+05,
+          -1.6133959371974322E+06, 2.4315566181017453E+05, 5.3904618484139396E+05,
+          2.4458227486795113E+04},
+         {2.1166189345881645E+04, 1.3382732160223130E+05, -3.3113450969689694E+05,
+          6.9013724510092140E-10, 3.3113450969689724E+05, -1.3382732160223136E+05,
+          -2.1166189345866893E+04},
+         {1.0542795672344864E+04, -7.0739172265098678E+03, -6.5563293056049893E+04,
+          1.2429734005960064E+05, -6.5563293056049602E+04, -7.0739172265098332E+03,
+          1.0542795672361213E+04},
+         {2.7903491906228419E+03, -1.0975382873973093E+04, 1.3656979541144799E+04,
+          7.7346408577822045E-10, -1.3656979541143772E+04, 1.0975382873973256E+04,
+          -2.7903491906078298E+03},
+         {1.6069721418053300E+02, -1.5518707872251393E+03, 4.3634273936642621E+03,
+          -5.9891976420595174E+03, 4.3634273936642730E+03, -1.5518707872251064E+03,
+          1.6069721419533221E+02},
+         {-1.2289277373867256E+02, 2.8583630927743314E+02, -2.8318194617327981E+02,
+          6.9043515551118249E-10, 2.8318194617392436E+02, -2.8583630927760140E+02,
+          1.2289277375319763E+02},
+         {-3.2270164914249058E+01, 9.1892112257581346E+01, -1.6710678096334209E+02,
+          2.0317049305432383E+02, -1.6710678096383771E+02, 9.1892112257416159E+01,
+          -3.2270164900224913E+01},
+         {-1.4761409685186277E-01, -9.1862771280377487E-01, 1.2845147741777752E+00,
+          5.6547359492808854E-10, -1.2845147728310689E+00, 9.1862771293147971E-01,
+          1.4761410890866353E-01}}};
+  } else if constexpr (w == 8) {
+    return std::array<std::array<T, w>, nc>{
+        {{7.3898000697447915E+03, 1.7297637497600035E+06, 2.5578341605285794E+07,
+          8.4789650417103335E+07, 8.4789650417103350E+07, 2.5578341605285816E+07,
+          1.7297637497600049E+06, 7.3898000697447915E+03},
+         {3.0719636811267599E+04, 3.1853145713323927E+06, 2.3797981861403696E+07,
+          2.4569731244678464E+07, -2.4569731244678471E+07, -2.3797981861403704E+07,
+          -3.1853145713323941E+06, -3.0719636811267606E+04},
+         {5.4488498478251728E+04, 2.4101183255475131E+06, 6.4554051283428287E+06,
+          -8.9200440393090546E+06, -8.9200440393090583E+06, 6.4554051283428324E+06,
+          2.4101183255475126E+06, 5.4488498478251728E+04},
+         {5.3926359802542116E+04, 9.0469037926849292E+05, -6.0897036277696118E+05,
+          -3.0743852105799988E+06, 3.0743852105800058E+06, 6.0897036277696711E+05,
+          -9.0469037926849339E+05, -5.3926359802542138E+04},
+         {3.2444118016247590E+04, 1.3079802224392134E+05, -5.8652889370129269E+05,
+          4.2333306008151924E+05, 4.2333306008152053E+05, -5.8652889370128722E+05,
+          1.3079802224392109E+05, 3.2444118016247590E+04},
+         {1.1864306345505294E+04, -2.2700360645707988E+04, -5.0713607251414309E+04,
+          1.8308704458211688E+05, -1.8308704458210632E+05, 5.0713607251413123E+04,
+          2.2700360645707628E+04, -1.1864306345505294E+04},
+         {2.2812256770903232E+03, -1.1569135767377773E+04, 2.0942387020798891E+04,
+          -1.1661592834945191E+04, -1.1661592834940149E+04, 2.0942387020801420E+04,
+          -1.1569135767377924E+04, 2.2812256770903286E+03},
+         {8.5503535636821422E+00, -9.7513976461238224E+02, 3.8242995179171526E+03,
+          -6.9201295567267280E+03, 6.9201295567248662E+03, -3.8242995179155446E+03,
+          9.7513976461209836E+02, -8.5503535637013552E+00},
+         {-1.0230637348345023E+02, 2.8246898554269114E+02, -3.8638201738139219E+02,
+          1.9106407993320320E+02, 1.9106407993289886E+02, -3.8638201738492717E+02,
+          2.8246898554219217E+02, -1.0230637348345138E+02},
+         {-1.9200143062947848E+01, 6.1692257626706223E+01, -1.2981109187842989E+02,
+          1.8681284210471688E+02, -1.8681284209654376E+02, 1.2981109187880142E+02,
+          -6.1692257626845532E+01, 1.9200143062947120E+01},
+         {3.7894993760177598E-01, -1.7334408836731494E+00, 2.5271184057877303E+00,
+          -1.2600963971824484E+00, -1.2600963917834651E+00, 2.5271184069685657E+00,
+          -1.7334408840526812E+00, 3.7894993760636758E-01}}};
+  } else if constexpr (w == 9) {
+    return std::array<std::array<T, w>, nc>{
+        {{1.3136365370186100E+04, 5.0196413492771806E+06, 1.1303327711722563E+08,
+          5.8225443924996686E+08, 9.7700272582690656E+08, 5.8225443924996758E+08,
+          1.1303327711722568E+08, 5.0196413492772207E+06, 1.3136365370186135E+04},
+         {5.8623313038274340E+04, 1.0326318537280345E+07, 1.2898448324824864E+08,
+          3.0522863709830385E+08, -3.9398045056223735E-08, -3.0522863709830391E+08,
+          -1.2898448324824864E+08, -1.0326318537280388E+07, -5.8623313038274347E+04},
+         {1.1335001341875963E+05, 9.0726133144784812E+06, 5.3501544534038112E+07,
+          -2.6789524644146336E+05, -1.2483923718899371E+08, -2.6789524644172983E+05,
+          5.3501544534038112E+07, 9.0726133144785129E+06, 1.1335001341875960E+05},
+         {1.2489113703229747E+05, 4.3035547171861930E+06, 6.3021978510598792E+06,
+          -2.6014941986659057E+07, 6.0417403157325170E-08, 2.6014941986659389E+07,
+          -6.3021978510598652E+06, -4.3035547171862079E+06, -1.2489113703229751E+05},
+         {8.6425493435991244E+04, 1.0891182836653308E+06, -2.0713033564200639E+06,
+          -2.8994941183506218E+06, 7.5905338661205899E+06, -2.8994941183505375E+06,
+          -2.0713033564200667E+06, 1.0891182836653353E+06, 8.6425493435991288E+04},
+         {3.8657354724013814E+04, 7.9936390113331305E+04, -7.0458265546791907E+05,
+          1.0151095605715880E+06, 1.2138090419648379E-07, -1.0151095605717725E+06,
+          7.0458265546794771E+05, -7.9936390113331567E+04, -3.8657354724013821E+04},
+         {1.0779131453134638E+04, -3.3466718311300596E+04, -1.3245366619006139E+04,
+          1.8238470515353698E+05, -2.9285656292977190E+05, 1.8238470515350526E+05,
+          -1.3245366619000662E+04, -3.3466718311299621E+04, 1.0779131453134616E+04},
+         {1.4992527030548456E+03, -9.7024371533891372E+03, 2.3216330734057381E+04,
+          -2.3465262819040818E+04, 5.3299736484284360E-08, 2.3465262819251962E+04,
+          -2.3216330734049119E+04, 9.7024371533890644E+03, -1.4992527030548747E+03},
+         {-7.9857427421129714E+01, -4.0585588534807385E+02, 2.6054813773472697E+03,
+          -6.1806593581075495E+03, 8.0679596874001718E+03, -6.1806593581869265E+03,
+          2.6054813773147021E+03, -4.0585588535363172E+02, -7.9857427421126204E+01},
+         {-7.1572272057937070E+01, 2.2785637019511205E+02, -3.9109820765665262E+02,
+          3.3597424711470910E+02, 1.0596763818009852E-07, -3.3597424723359080E+02,
+          3.9109820766854079E+02, -2.2785637019009673E+02, 7.1572272057939983E+01},
+         {-9.8886360698074700E+00, 3.5359026949867051E+01, -8.5251867715709949E+01,
+          1.4285748012617628E+02, -1.6935269668779691E+02, 1.4285748010331625E+02,
+          -8.5251867711661305E+01, 3.5359026944299828E+01, -9.8886360698207305E+00}}};
+  } else if constexpr (w == 10) {
+    return std::array<std::array<T, w>, nc>{{{
+                                                 2.2594586605749264E+04,
+                                                 1.3595989066786593E+07,
+                                                 4.4723032442444897E+08,
+                                                 3.3781755837397518E+09,
+                                                 8.6836783895849819E+09,
+                                                 8.6836783895849762E+09,
+                                                 3.3781755837397494E+09,
+                                                 4.4723032442444897E+08,
+                                                 1.3595989066786474E+07,
+                                                 2.2594586605749344E+04,
+                                             },
+                                             {
+                                                 1.0729981697645642E+05,
+                                                 3.0651490267742988E+07,
+                                                 5.9387966085130465E+08,
+                                                 2.4434902657508330E+09,
+                                                 2.0073077861288922E+09,
+                                                 -2.0073077861288943E+09,
+                                                 -2.4434902657508330E+09,
+                                                 -5.9387966085130453E+08,
+                                                 -3.0651490267742816E+07,
+                                                 -1.0729981697645638E+05,
+                                             },
+                                             {
+                                                 2.2340399734184606E+05,
+                                                 3.0258214643190462E+07,
+                                                 3.1512411458738232E+08,
+                                                 4.3618276932319808E+08,
+                                                 -7.8178848450497293E+08,
+                                                 -7.8178848450497019E+08,
+                                                 4.3618276932319826E+08,
+                                                 3.1512411458738232E+08,
+                                                 3.0258214643190313E+07,
+                                                 2.2340399734184548E+05,
+                                             },
+                                             {
+                                                 2.6917433004353486E+05,
+                                                 1.6875651476661228E+07,
+                                                 7.4664745481963441E+07,
+                                                 -9.5882157211118385E+07,
+                                                 -2.0622994435532519E+08,
+                                                 2.0622994435532743E+08,
+                                                 9.5882157211118177E+07,
+                                                 -7.4664745481963515E+07,
+                                                 -1.6875651476661161E+07,
+                                                 -2.6917433004353428E+05,
+                                             },
+                                             {
+                                                 2.0818422772177903E+05,
+                                                 5.6084730690362519E+06,
+                                                 1.4435118192351763E+06,
+                                                 -4.0063869969544649E+07,
+                                                 3.2803674392747045E+07,
+                                                 3.2803674392746095E+07,
+                                                 -4.0063869969546899E+07,
+                                                 1.4435118192351642E+06,
+                                                 5.6084730690362034E+06,
+                                                 2.0818422772177853E+05,
+                                             },
+                                             {
+                                                 1.0781139496011091E+05,
+                                                 9.9202615851199068E+05,
+                                                 -3.3266265543962116E+06,
+                                                 -4.8557049011479173E+05,
+                                                 1.0176155522772279E+07,
+                                                 -1.0176155522772269E+07,
+                                                 4.8557049011678610E+05,
+                                                 3.3266265543963453E+06,
+                                                 -9.9202615851196018E+05,
+                                                 -1.0781139496011072E+05,
+                                             },
+                                             {
+                                                 3.7380102688153558E+04,
+                                                 1.2716675000355666E+04,
+                                                 -6.2163527451774501E+05,
+                                                 1.4157962667184104E+06,
+                                                 -8.4419693137680157E+05,
+                                                 -8.4419693137743860E+05,
+                                                 1.4157962667189445E+06,
+                                                 -6.2163527451771160E+05,
+                                                 1.2716675000340010E+04,
+                                                 3.7380102688153442E+04,
+                                             },
+                                             {
+                                                 8.1238936393894646E+03,
+                                                 -3.4872365530450072E+04,
+                                                 2.3913680325196314E+04,
+                                                 1.2428850301830019E+05,
+                                                 -3.2158255329716846E+05,
+                                                 3.2158255329951923E+05,
+                                                 -1.2428850301867779E+05,
+                                                 -2.3913680325277423E+04,
+                                                 3.4872365530457188E+04,
+                                                 -8.1238936393894255E+03,
+                                             },
+                                             {
+                                                 7.8515926628982663E+02,
+                                                 -6.6607899119372642E+03,
+                                                 2.0167398338513311E+04,
+                                                 -2.8951401344519112E+04,
+                                                 1.4622828142848679E+04,
+                                                 1.4622828143544031E+04,
+                                                 -2.8951401346900999E+04,
+                                                 2.0167398338398041E+04,
+                                                 -6.6607899119505255E+03,
+                                                 7.8515926628967964E+02,
+                                             },
+                                             {
+                                                 -1.0147176570537010E+02,
+                                                 -3.5304284185385157E+01,
+                                                 1.3576976854876134E+03,
+                                                 -4.3921059353471856E+03,
+                                                 7.3232085271125388E+03,
+                                                 -7.3232085273978546E+03,
+                                                 4.3921059367737662E+03,
+                                                 -1.3576976854043962E+03,
+                                                 3.5304284185385157E+01,
+                                                 1.0147176570550941E+02,
+                                             },
+                                             {
+                                                 -4.3161545259389186E+01,
+                                                 1.5498490981579428E+02,
+                                                 -3.1771250774232175E+02,
+                                                 3.7215448796427023E+02,
+                                                 -1.7181762832770994E+02,
+                                                 -1.7181763036843782E+02,
+                                                 3.7215448789408123E+02,
+                                                 -3.1771250773692140E+02,
+                                                 1.5498490982186786E+02,
+                                                 -4.3161545259547800E+01,
+                                             },
+                                             {
+                                                 -4.2916172038214198E+00,
+                                                 1.7402146071148604E+01,
+                                                 -4.7947588069135868E+01,
+                                                 9.2697698088029625E+01,
+                                                 -1.2821427596894478E+02,
+                                                 1.2821427705670308E+02,
+                                                 -9.2697698297776569E+01,
+                                                 4.7947588093524907E+01,
+                                                 -1.7402146074502035E+01,
+                                                 4.2916172038452141E+00,
+                                             }}};
+  } else if constexpr (w == 11) {
+    return std::array<std::array<T, w>, nc>{{{
+                                                 3.7794653219809625E+04,
+                                                 3.4782300224660739E+07,
+                                                 1.6188020733727551E+09,
+                                                 1.7196758809615005E+10,
+                                                 6.3754384857724617E+10,
+                                                 9.7196447559193497E+10,
+                                                 6.3754384857724617E+10,
+                                                 1.7196758809614998E+10,
+                                                 1.6188020733727560E+09,
+                                                 3.4782300224660769E+07,
+                                                 3.7794653219808984E+04,
+                                             },
+                                             {
+                                                 1.8969206922085886E+05,
+                                                 8.4769319065313652E+07,
+                                                 2.4230555767723408E+09,
+                                                 1.5439732722639101E+10,
+                                                 2.7112836839612309E+10,
+                                                 2.5609833368650835E-06,
+                                                 -2.7112836839612328E+10,
+                                                 -1.5439732722639105E+10,
+                                                 -2.4230555767723408E+09,
+                                                 -8.4769319065313682E+07,
+                                                 -1.8969206922085711E+05,
+                                             },
+                                             {
+                                                 4.2138380313901440E+05,
+                                                 9.2050522922791913E+07,
+                                                 1.5259983101266613E+09,
+                                                 4.7070559561237173E+09,
+                                                 -1.2448027572952359E+09,
+                                                 -1.0161446790279301E+10,
+                                                 -1.2448027572952316E+09,
+                                                 4.7070559561237268E+09,
+                                                 1.5259983101266615E+09,
+                                                 9.2050522922791913E+07,
+                                                 4.2138380313901149E+05,
+                                             },
+                                             {
+                                                 5.4814313598122005E+05,
+                                                 5.8085130777589552E+07,
+                                                 4.9484006166551048E+08,
+                                                 1.6222124676640952E+08,
+                                                 -2.0440440381345339E+09,
+                                                 9.1416457449079640E-06,
+                                                 2.0440440381345336E+09,
+                                                 -1.6222124676640788E+08,
+                                                 -4.9484006166551071E+08,
+                                                 -5.8085130777589560E+07,
+                                                 -5.4814313598121714E+05,
+                                             },
+                                             {
+                                                 4.6495183529254980E+05,
+                                                 2.3067199578027144E+07,
+                                                 6.9832590192482382E+07,
+                                                 -2.2024799260683522E+08,
+                                                 -1.2820270942588677E+08,
+                                                 5.1017181199129778E+08,
+                                                 -1.2820270942588474E+08,
+                                                 -2.2024799260683942E+08,
+                                                 6.9832590192482322E+07,
+                                                 2.3067199578027155E+07,
+                                                 4.6495183529254742E+05,
+                                             },
+                                             {
+                                                 2.7021781043532980E+05,
+                                                 5.6764510325100143E+06,
+                                                 -5.5650761736748898E+06,
+                                                 -3.9907385617900200E+07,
+                                                 7.2453390663687646E+07,
+                                                 1.2300109686762266E-05,
+                                                 -7.2453390663684472E+07,
+                                                 3.9907385617899075E+07,
+                                                 5.5650761736749066E+06,
+                                                 -5.6764510325099993E+06,
+                                                 -2.7021781043532846E+05,
+                                             },
+                                             {
+                                                 1.0933249308680627E+05,
+                                                 6.9586821127987828E+05,
+                                                 -3.6860240321937902E+06,
+                                                 2.7428169457736355E+06,
+                                                 8.3392008440593518E+06,
+                                                 -1.6402201025046850E+07,
+                                                 8.3392008440698013E+06,
+                                                 2.7428169457778852E+06,
+                                                 -3.6860240321937371E+06,
+                                                 6.9586821127989423E+05,
+                                                 1.0933249308680571E+05,
+                                             },
+                                             {
+                                                 3.0203516161820498E+04,
+                                                 -3.6879059542768438E+04,
+                                                 -4.1141031216788280E+05,
+                                                 1.4111389975267777E+06,
+                                                 -1.5914376635331670E+06,
+                                                 9.4095582602103753E-06,
+                                                 1.5914376635379130E+06,
+                                                 -1.4111389975247320E+06,
+                                                 4.1141031216776522E+05,
+                                                 3.6879059542750314E+04,
+                                                 -3.0203516161820549E+04,
+                                             },
+                                             {
+                                                 5.1670143574922731E+03,
+                                                 -2.8613147115372190E+04,
+                                                 4.3560195427081359E+04,
+                                                 4.8438679582765450E+04,
+                                                 -2.5856630639231802E+05,
+                                                 3.7994883866738499E+05,
+                                                 -2.5856630640319458E+05,
+                                                 4.8438679579510936E+04,
+                                                 4.3560195426766244E+04,
+                                                 -2.8613147115376054E+04,
+                                                 5.1670143574922913E+03,
+                                             },
+                                             {
+                                                 3.0888018539740131E+02,
+                                                 -3.7949446187471626E+03,
+                                                 1.4313303204988082E+04,
+                                                 -2.6681600235594462E+04,
+                                                 2.3856005166166615E+04,
+                                                 8.6424601730164351E-06,
+                                                 -2.3856005155895236E+04,
+                                                 2.6681600234453199E+04,
+                                                 -1.4313303205083188E+04,
+                                                 3.7949446187583080E+03,
+                                                 -3.0888018539728523E+02,
+                                             },
+                                             {
+                                                 -8.3747489794189363E+01,
+                                                 1.1948077479405792E+02,
+                                                 4.8528498015072080E+02,
+                                                 -2.5024391114755094E+03,
+                                                 5.3511195318669425E+03,
+                                                 -6.7655484107390166E+03,
+                                                 5.3511195362291774E+03,
+                                                 -2.5024391131167667E+03,
+                                                 4.8528498019392708E+02,
+                                                 1.1948077480620087E+02,
+                                                 -8.3747489794426258E+01,
+                                             },
+                                             {
+                                                 -2.2640047135517630E+01,
+                                                 9.0840898563949466E+01,
+                                                 -2.1597187544386938E+02,
+                                                 3.1511229111443720E+02,
+                                                 -2.4856617998395282E+02,
+                                                 6.1683918215190516E-06,
+                                                 2.4856618439352349E+02,
+                                                 -3.1511228757800421E+02,
+                                                 2.1597187557069353E+02,
+                                                 -9.0840898570046704E+01,
+                                                 2.2640047135565219E+01,
+                                             },
+                                             {
+                                                 -1.6306382886201207E+00,
+                                                 7.3325946591320434E+00,
+                                                 -2.3241017682854558E+01,
+                                                 5.1715494398901185E+01,
+                                                 -8.2673000279130790E+01,
+                                                 9.6489719151212370E+01,
+                                                 -8.2673010381149226E+01,
+                                                 5.1715494328769353E+01,
+                                                 -2.3241018024860580E+01,
+                                                 7.3325946448852415E+00,
+                                                 -1.6306382886460551E+00,
+                                             }}};
+  } else if constexpr (w == 12) {
+    return std::array<std::array<T, w>, nc>{
+        {{6.1722991679852908E+04, 8.4789650417103648E+07, 5.4431675199498701E+09,
+          7.8788892335272232E+10, 4.0355760945670044E+11, 8.8071481911347949E+11,
+          8.8071481911347961E+11, 4.0355760945670044E+11, 7.8788892335272430E+10,
+          5.4431675199498835E+09, 8.4789650417103708E+07, 6.1722991679871957E+04},
+         {3.2561466099406168E+05, 2.2112758120210618E+08, 8.9911609880089817E+09,
+          8.3059508064200943E+10, 2.3965569143469864E+11, 1.6939286803305212E+11,
+          -1.6939286803305203E+11, -2.3965569143469864E+11, -8.3059508064201080E+10,
+          -8.9911609880089989E+09, -2.2112758120210618E+08, -3.2561466099404311E+05},
+         {7.6621098001581512E+05, 2.6026568260310286E+08, 6.4524338253008652E+09,
+          3.3729904113826820E+10, 2.8555202212474091E+10, -6.8998572040731537E+10,
+          -6.8998572040731445E+10, 2.8555202212474079E+10, 3.3729904113826824E+10,
+          6.4524338253008757E+09, 2.6026568260310274E+08, 7.6621098001583829E+05},
+         {1.0657807616803218E+06, 1.8144472126890984E+08, 2.5524827004349842E+09,
+          5.2112383911371660E+09, -1.0268350564014645E+10, -1.4763245309081306E+10,
+          1.4763245309081314E+10, 1.0268350564014671E+10, -5.2112383911371059E+09,
+          -2.5524827004349871E+09, -1.8144472126890984E+08, -1.0657807616803099E+06},
+         {9.7829638830158755E+05, 8.2222351241519913E+07, 5.5676911894064474E+08,
+          -4.8739037675427330E+08, -2.7153428193078227E+09, 2.5627633609246106E+09,
+          2.5627633609246163E+09, -2.7153428193078651E+09, -4.8739037675430620E+08,
+          5.5676911894064546E+08, 8.2222351241519868E+07, 9.7829638830161188E+05},
+         {6.2536876825114002E+05, 2.4702814073680203E+07, 4.1488431554846466E+07,
+          -2.9274790542418826E+08, 1.0742154109191516E+08, 6.2185168968032193E+08,
+          -6.2185168968012476E+08, -1.0742154109184742E+08, 2.9274790542423087E+08,
+          -4.1488431554843128E+07, -2.4702814073680237E+07, -6.2536876825112454E+05},
+         {2.8527714307528478E+05, 4.6266378435690766E+06, -1.0665598090790771E+07,
+          -2.6048960239891130E+07, 9.1597254427317813E+07, -5.9794495983264342E+07,
+          -5.9794495983220413E+07, 9.1597254427343085E+07, -2.6048960239921503E+07,
+          -1.0665598090794146E+07, 4.6266378435690673E+06, 2.8527714307530399E+05},
+         {9.2873647411234080E+04, 3.6630046787425119E+05, -3.1271047224730137E+06,
+          4.8612412939252760E+06, 3.3820440907796426E+06, -1.6880127953704204E+07,
+          1.6880127953756198E+07, -3.3820440907614031E+06, -4.8612412938993908E+06,
+          3.1271047224752530E+06, -3.6630046787425695E+05, -9.2873647411217215E+04},
+         {2.0817947751046438E+04, -5.5660303410315042E+04, -1.9519783923444615E+05,
+          1.0804817251338551E+06, -1.8264985852555393E+06, 9.7602844968061335E+05,
+          9.7602844962902542E+05, -1.8264985852963410E+06, 1.0804817251124913E+06,
+          -1.9519783923503032E+05, -5.5660303410363231E+04, 2.0817947751063632E+04},
+         {2.7986023314783361E+03, -1.9404411093655592E+04, 4.3922625000519314E+04,
+          -7.6450317451901383E+03, -1.5273911974273989E+05, 3.3223441458516393E+05,
+          -3.3223441441930021E+05, 1.5273911979752057E+05, 7.6450317512768806E+03,
+          -4.3922624998141677E+04, 1.9404411093637758E+04, -2.7986023314644049E+03},
+         {6.7849020474048089E+01, -1.7921351308204744E+03, 8.4980694686552797E+03,
+          -1.9742624859769410E+04, 2.4620674845030797E+04, -1.1676544851227827E+04,
+          -1.1676544869194569E+04, 2.4620674845030626E+04, -1.9742624831436660E+04,
+          8.4980694630406069E+03, -1.7921351308312935E+03, 6.7849020488592075E+01},
+         {-5.4577020998836872E+01, 1.3637112867242237E+02, 4.5513616580246023E+01,
+          -1.1174001367986359E+03, 3.2018769312434206E+03, -5.0580351396215219E+03,
+          5.0580351683422405E+03, -3.2018769242193171E+03, 1.1174000998831286E+03,
+          -4.5513609243969356E+01, -1.3637112867730119E+02, 5.4577021011726984E+01},
+         {-1.0538365872268786E+01, 4.6577222488645518E+01, -1.2606964198473415E+02,
+          2.1881091668968099E+02, -2.3273399614976032E+02, 1.0274275204276027E+02,
+          1.0274270265494516E+02, -2.3273401859852868E+02, 2.1881091865396468E+02,
+          -1.2606964777237258E+02, 4.6577222453584369E+01, -1.0538365860573146E+01},
+         {-4.6087004144309118E-01, 2.5969759128998060E+00, -9.6946932216381381E+00,
+          2.4990041962121211E+01, -4.6013909139329137E+01, 6.2056985032913090E+01,
+          -6.2056925855365186E+01, 4.6013921000662158E+01, -2.4990037445376750E+01,
+          9.6946954085586885E+00, -2.5969759201692755E+00, 4.6087004744129911E-01}}};
+  } else if constexpr (w == 13) {
+    return std::array<std::array<T, w>, nc>{
+        {{9.8715725867495363E+04, 1.9828875496808097E+08, 1.7196758809614983E+10,
+          3.3083776881353577E+11, 2.2668873993375439E+12, 6.7734720591167568E+12,
+          9.6695220682534785E+12, 6.7734720591167432E+12, 2.2668873993375430E+12,
+          3.3083776881353503E+11, 1.7196758809614998E+10, 1.9828875496807891E+08,
+          9.8715725867496090E+04},
+         {5.4491110456935549E+05, 5.4903670125539351E+08, 3.0879465445278183E+10,
+          3.9588436413399969E+11, 1.6860562536749778E+12, 2.4256447893117891E+12,
+          -5.5583944938791784E-05, -2.4256447893117847E+12, -1.6860562536749768E+12,
+          -3.9588436413399890E+11, -3.0879465445278183E+10, -5.4903670125538898E+08,
+          -5.4491110456935526E+05},
+         {1.3504711883426071E+06, 6.9286979077463162E+08, 2.4618123595484577E+10,
+          1.9493985627722607E+11, 3.9422703517046350E+11, -1.8678883613919861E+11,
+          -8.5538079834550110E+11, -1.8678883613919730E+11, 3.9422703517046375E+11,
+          1.9493985627722589E+11, 2.4618123595484566E+10, 6.9286979077462614E+08,
+          1.3504711883426069E+06},
+         {1.9937206140846491E+06, 5.2512029493765980E+08, 1.1253303793811750E+10,
+          4.6205527735932152E+10, -1.1607472377983305E+10, -1.6305241755642313E+11,
+          3.5385440504350348E-04, 1.6305241755642365E+11, 1.1607472377982582E+10,
+          -4.6205527735932213E+10, -1.1253303793811750E+10, -5.2512029493765628E+08,
+          -1.9937206140846489E+06},
+         {1.9607419630386413E+06, 2.6425362558103892E+08, 3.1171259341747193E+09,
+          2.9839860297839913E+09, -1.9585031917561897E+10, -5.0666917387065792E+09,
+          3.6568794485480583E+10, -5.0666917387057562E+09, -1.9585031917561817E+10,
+          2.9839860297838497E+09, 3.1171259341747184E+09, 2.6425362558103728E+08,
+          1.9607419630386417E+06},
+         {1.3593773865640305E+06, 9.1556445104158267E+07, 4.7074012944133747E+08,
+          -1.1192579335657008E+09, -2.1090780087868555E+09, 5.2270306737951984E+09,
+          5.6467240041521856E-04, -5.2270306737934217E+09, 2.1090780087880819E+09,
+          1.1192579335658383E+09, -4.7074012944133127E+08, -9.1556445104157984E+07,
+          -1.3593773865640305E+06},
+         {6.8417206432039209E+05, 2.1561705510027152E+07, 7.5785249893055111E+06,
+          -2.7456096030221754E+08, 3.4589095671054310E+08, 4.0256106808894646E+08,
+          -1.0074306926603404E+09, 4.0256106809081393E+08, 3.4589095670997137E+08,
+          -2.7456096030236483E+08, 7.5785249893030487E+06, 2.1561705510027405E+07,
+          6.8417206432039209E+05},
+         {2.5248269397037517E+05, 3.0985559672616189E+06, -1.1816517087616559E+07,
+          -8.2958498770184973E+06, 8.0546642347355247E+07, -1.0594657799485898E+08,
+          2.1816722293163801E-04, 1.0594657799424352E+08, -8.0546642347497791E+07,
+          8.2958498771036500E+06, 1.1816517087615721E+07, -3.0985559672621777E+06,
+          -2.5248269397037517E+05},
+         {6.7530100970876694E+04, 1.2373362326658823E+05, -2.1245597183281910E+06,
+          5.1047323238754412E+06, -1.4139444405488928E+06, -1.1818267555096827E+07,
+          2.0121548578624789E+07, -1.1818267557079868E+07, -1.4139444401348191E+06,
+          5.1047323236516044E+06, -2.1245597183309775E+06, 1.2373362326702787E+05,
+          6.7530100970876316E+04},
+         {1.2421368748961073E+04, -5.0576243647011936E+04, -4.8878193436902722E+04,
+          6.5307896872028301E+05, -1.5497610127060430E+06, 1.5137725917321201E+06,
+          4.1615986404011299E-04, -1.5137725918538549E+06, 1.5497610130469005E+06,
+          -6.5307896856811445E+05, 4.8878193438804832E+04, 5.0576243646433126E+04,
+          -1.2421368748961073E+04},
+         {1.2904654687550299E+03, -1.1169946055009055E+04, 3.3275109713863385E+04,
+          -3.1765222274236821E+04, -5.9810982085323274E+04, 2.2355863038592847E+05,
+          -3.1083591705219547E+05, 2.2355863445202672E+05, -5.9810982721084511E+04,
+          -3.1765222464963932E+04, 3.3275109714208855E+04, -1.1169946054555618E+04,
+          1.2904654687545376E+03},
+         {-1.9043622268674213E+01, -6.8296542209516542E+02, 4.2702512274202591E+03,
+          -1.2165497317825058E+04, 1.9423733298269544E+04, -1.6010024066956401E+04,
+          3.4018642874429026E-04, 1.6010021599471667E+04, -1.9423732817821805E+04,
+          1.2165497483905752E+04, -4.2702512286689680E+03, 6.8296542153908558E+02,
+          1.9043622268312891E+01},
+         {-3.0093984465361217E+01, 9.8972865724808671E+01, -9.7437038666761538E+01,
+          -3.5079928405373198E+02, 1.5699250566648977E+03, -3.1287439837941820E+03,
+          3.8692196309709061E+03, -3.1287462825615335E+03, 1.5699252631958864E+03,
+          -3.5079944793112952E+02, -9.7437041893750632E+01, 9.8972866189610414E+01,
+          -3.0093984465884773E+01},
+         {-4.3050286009489040E+00, 2.1108975724659501E+01, -6.4297198812570272E+01,
+          1.2922884632277874E+02, -1.6991812716212596E+02, 1.2655005901719436E+02,
+          9.2483537895948854E-05, -1.2655066232531748E+02, 1.6991805207569072E+02,
+          -1.2922893667436634E+02, 6.4297198424711908E+01, -2.1108976207523057E+01,
+          4.3050286009485790E+00},
+         {-1.0957333716725008E-01, 7.2949317004436565E-01, -3.4300816058693728E+00,
+          1.0470054474579324E+01, -2.2292134950656113E+01, 3.4570827323582719E+01,
+          -3.9923523442753932E+01, 3.4573264959502886E+01, -2.2292358612963266E+01,
+          1.0470042004916014E+01, -3.4300810538570281E+00, 7.2949352113279253E-01,
+          -1.0957333740315604E-01}}};
+  } else if constexpr (w == 14) {
+    return std::array<std::array<T, w>, nc>{
+        {{1.5499533202966207E+05, 4.4723032442444688E+08, 5.1495083701694740E+10,
+          1.2904576022918071E+12, 1.1534950432785506E+13, 4.5650102198520484E+13,
+          8.8830582190032641E+13, 8.8830582190032641E+13, 4.5650102198520492E+13,
+          1.1534950432785527E+13, 1.2904576022918074E+12, 5.1495083701695107E+10,
+          4.4723032442444855E+08, 1.5499533202970232E+05},
+         {8.9188339002980455E+05, 1.3065352538728635E+09, 9.9400185225815567E+10,
+          1.7136059013402405E+12, 1.0144146621675832E+13, 2.3034036018490715E+13,
+          1.4630967270448871E+13, -1.4630967270448855E+13, -2.3034036018490719E+13,
+          -1.0144146621675846E+13, -1.7136059013402405E+12, -9.9400185225815964E+10,
+          -1.3065352538728662E+09, -8.9188339002979454E+05},
+         {2.3170473769379663E+06, 1.7532505043698256E+09, 8.6523535958354309E+10,
+          9.7455289065487354E+11, 3.2977972139362314E+12, 1.7874626001697781E+12,
+          -6.1480918082633916E+12, -6.1480918082633975E+12, 1.7874626001697690E+12,
+          3.2977972139362285E+12, 9.7455289065487329E+11, 8.6523535958354630E+10,
+          1.7532505043698275E+09, 2.3170473769380399E+06},
+         {3.6089249230396422E+06, 1.4278058213962190E+09, 4.4296625537022423E+10,
+          2.9466624630419781E+11, 3.1903621584503235E+11, -9.8834691411254565E+11,
+          -1.1072264714919226E+12, 1.1072264714919316E+12, 9.8834691411255151E+11,
+          -3.1903621584503467E+11, -2.9466624630419769E+11, -4.4296625537022621E+10,
+          -1.4278058213962219E+09, -3.6089249230396664E+06},
+         {3.7733555140851745E+06, 7.8376718099107409E+08, 1.4443117772349569E+10,
+          4.3197433307418671E+10, -7.6585042240585556E+10, -1.8569640140763062E+11,
+          2.0385335192657199E+11, 2.0385335192656519E+11, -1.8569640140762662E+11,
+          -7.6585042240580856E+10, 4.3197433307418686E+10, 1.4443117772349669E+10,
+          7.8376718099107552E+08, 3.7733555140852560E+06},
+         {2.8079157920112358E+06, 3.0340753492383724E+08, 2.9498136661747241E+09,
+          -6.2820200387919831E+08, -2.2372008390623215E+10, 1.5217518660584890E+10,
+          4.0682590266891922E+10, -4.0682590266869431E+10, -1.5217518660582748E+10,
+          2.2372008390625935E+10, 6.2820200387968791E+08, -2.9498136661747637E+09,
+          -3.0340753492383808E+08, -2.8079157920112377E+06},
+         {1.5361613559533111E+06, 8.3513615594416574E+07, 3.0077547202708024E+08,
+          -1.3749596754067802E+09, -6.6733027297557127E+08, 5.9590333632819109E+09,
+          -4.3025685566870070E+09, -4.3025685566872711E+09, 5.9590333632806673E+09,
+          -6.6733027297523963E+08, -1.3749596754067125E+09, 3.0077547202709383E+08,
+          8.3513615594416171E+07, 1.5361613559533576E+06},
+         {6.2759409419592959E+05, 1.5741723594963098E+07, -1.5632610223406436E+07,
+          -1.9294824907078514E+08, 4.4643806532434595E+08, 1.5178998385244830E+07,
+          -9.6771139891725647E+08, 9.6771139892509627E+08, -1.5178998381042883E+07,
+          -4.4643806533176166E+08, 1.9294824907065383E+08, 1.5632610223392555E+07,
+          -1.5741723594963137E+07, -6.2759409419590747E+05},
+         {1.9151404903933613E+05, 1.7156606891563335E+06, -9.7733523156688716E+06,
+          4.2982266233154163E+06, 5.1660907884347722E+07, -1.1279400211155911E+08,
+          6.4701089573962681E+07, 6.4701089571562663E+07, -1.1279400211012064E+08,
+          5.1660907891220264E+07, 4.2982266233826512E+06, -9.7733523157112263E+06,
+          1.7156606891560503E+06, 1.9151404903936724E+05},
+         {4.2715272622845026E+04, -2.2565910611953568E+03, -1.1769776156959014E+06,
+          4.0078399907813077E+06, -3.8951858063335596E+06, -5.0944610754510267E+06,
+          1.6765992446914168E+07, -1.6765992426657490E+07, 5.0944610781778870E+06,
+          3.8951858062361716E+06, -4.0078399907326135E+06, 1.1769776157141617E+06,
+          2.2565910606306688E+03, -4.2715272622820135E+04},
+         {6.4806786522793900E+03, -3.5474227032974472E+04, 1.8237100709385861E+04,
+          3.0934714629696816E+05, -1.0394703931686131E+06, 1.4743920333143482E+06,
+          -7.3356882447856572E+05, -7.3356882916658197E+05, 1.4743920305501707E+06,
+          -1.0394703929917105E+06, 3.0934714631908614E+05, 1.8237100665157792E+04,
+          -3.5474227033406372E+04, 6.4806786523010323E+03},
+         {4.9913632908459954E+02, -5.5416668524952684E+03, 2.0614058717617296E+04,
+          -3.2285139072943130E+04, -5.3099550821623425E+03, 1.1559000502166932E+05,
+          -2.2569743259261423E+05, 2.2569743616896842E+05, -1.1559000130545651E+05,
+          5.3099543129458480E+03, 3.2285139142872020E+04, -2.0614058670790018E+04,
+          5.5416668533342381E+03, -4.9913632906195977E+02},
+         {-3.3076333188134086E+01, -1.8970588563697331E+02, 1.8160423493164808E+03,
+          -6.3715703355644328E+03, 1.2525624574329036E+04, -1.4199806452802783E+04,
+          6.4441892296909591E+03, 6.4441909537524216E+03, -1.4199808176873401E+04,
+          1.2525626154733827E+04, -6.3715704433222418E+03, 1.8160422729911850E+03,
+          -1.8970588700495102E+02, -3.3076333168231550E+01},
+         {-1.4394533627743886E+01, 5.7000699089242815E+01, -1.0101142663923416E+02,
+          -3.2954197414395189E+01, 6.1417879182394654E+02, -1.6177283846697430E+03,
+          2.4593386157454975E+03, -2.4593322941165261E+03, 1.6177291239900730E+03,
+          -6.1417952013923764E+02, 3.2954100943010943E+01, 1.0101142710333265E+02,
+          -5.7000699100179844E+01, 1.4394533639240331E+01},
+         {-1.5925952284027161E+00, 8.5113930215357829E+00, -2.8993523187012922E+01,
+          6.6373454994590404E+01, -1.0329574518449559E+02, 1.0280184257681817E+02,
+          -4.3896094875192006E+01, -4.3899302208087086E+01, 1.0280039795628096E+02,
+          -1.0329511291885207E+02, 6.6373435700858948E+01, -2.8993536490606409E+01,
+          8.5113924808491728E+00, -1.5925952194145006E+00},
+         {1.5984868520881029E-02, 1.2876175212962959E-01, -9.8358742969175483E-01,
+          3.7711523389360830E+00, -9.4305498095765508E+00, 1.6842854581416674E+01,
+          -2.2308566502972713E+01, 2.2308940200151390E+01, -1.6841512668820517E+01,
+          9.4313524091989347E+00, -3.7710716543179599E+00, 9.8361025494556609E-01,
+          -1.2876100566420701E-01, -1.5984859433053292E-02}}};
+  } else if constexpr (w == 15) {
+    return std::array<std::array<T, w>, nc>{
+        {{2.3939707792241839E+05, 9.7700272582690191E+08, 1.4715933396485257E+11,
+          4.7242424833337158E+12, 5.3987426629953594E+13, 2.7580474290566078E+14,
+          7.0693378336533400E+14, 9.6196578554477775E+14, 7.0693378336533400E+14,
+          2.7580474290566125E+14, 5.3987426629953766E+13, 4.7242424833337246E+12,
+          1.4715933396485263E+11, 9.7700272582690215E+08, 2.3939707792242285E+05},
+         {1.4314487885226035E+06, 2.9961416925358453E+09, 3.0273361232748438E+11,
+          6.8507333793903584E+12, 5.4192702756911000E+13, 1.7551587948105309E+14,
+          2.1874615668430150E+14, 3.4316191014053393E-02, -2.1874615668430150E+14,
+          -1.7551587948105334E+14, -5.4192702756911180E+13, -6.8507333793903701E+12,
+          -3.0273361232748438E+11, -2.9961416925358458E+09, -1.4314487885226049E+06},
+         {3.8829497354762917E+06, 4.2473082696966448E+09, 2.8414312556015540E+11,
+          4.3688281331121411E+12, 2.1823119508000543E+13, 3.2228098609392094E+13,
+          -2.1833085454691789E+13, -7.3750710225100812E+13, -2.1833085454691820E+13,
+          3.2228098609392055E+13, 2.1823119508000594E+13, 4.3688281331121479E+12,
+          2.8414312556015527E+11, 4.2473082696966434E+09, 3.8829497354762889E+06},
+         {6.3495763451755755E+06, 3.6841035003733950E+09, 1.5965774278321045E+11,
+          1.5630338683778201E+12, 3.8749058615819268E+12, -2.7319740087723574E+12,
+          -1.3233342822865402E+13, 6.1642230420317079E-02, 1.3233342822865449E+13,
+          2.7319740087723975E+12, -3.8749058615819365E+12, -1.5630338683778203E+12,
+          -1.5965774278321042E+11, -3.6841035003733935E+09, -6.3495763451755764E+06},
+         {7.0146619045520434E+06, 2.1782897863065763E+09, 5.8897780310148087E+10,
+          3.1953009601770325E+11, 4.0651527029737198E+08, -1.6379148273276064E+12,
+          -1.1568753137013029E+11, 2.7451653250460508E+12, -1.1568753137012485E+11,
+          -1.6379148273277261E+12, 4.0651527029819238E+08, 3.1953009601770361E+11,
+          5.8897780310148087E+10, 2.1782897863065763E+09, 7.0146619045520443E+06},
+         {5.5580012413990172E+06, 9.2345162185944164E+08, 1.4522950934020109E+10,
+          2.7025952371212009E+10, -1.2304576967641914E+11, -1.0116752717202786E+11,
+          3.8517418245458325E+11, 1.0918347404432817E-01, -3.8517418245444312E+11,
+          1.0116752717221135E+11, 1.2304576967643665E+11, -2.7025952371214943E+10,
+          -1.4522950934020079E+10, -9.2345162185944211E+08, -5.5580012413990181E+06},
+         {3.2693972344231778E+06, 2.8610260147425205E+08, 2.2348528403750563E+09,
+          -3.4574515574242272E+09, -1.7480626463583939E+10, 3.1608597465540653E+10,
+          1.9879262560072273E+10, -6.6148013553772224E+10, 1.9879262560085339E+10,
+          3.1608597465515747E+10, -1.7480626463576942E+10, -3.4574515574198236E+09,
+          2.2348528403750110E+09, 2.8610260147425193E+08, 3.2693972344231787E+06},
+         {1.4553539959296256E+06, 6.4136842048384041E+07, 1.3622336582062906E+08,
+          -1.2131510424644001E+09, 6.4322366984221375E+08, 4.5078753872047586E+09,
+          -7.1689413746930647E+09, 3.2906916833662987E-02, 7.1689413746724453E+09,
+          -4.5078753875009747E+09, -6.4322366985365331E+08, 1.2131510424608817E+09,
+          -1.3622336582067037E+08, -6.4136842048384242E+07, -1.4553539959296256E+06},
+         {4.9358776531681651E+05, 9.7772970960585065E+06, -2.3511574237987626E+07,
+          -1.0142613816641946E+08, 3.9421144218035364E+08, -2.8449115593052310E+08,
+          -5.7549243243741119E+08, 1.1608781631182449E+09, -5.7549243240763104E+08,
+          -2.8449115600447333E+08, 3.9421144214381480E+08, -1.0142613816429654E+08,
+          -2.3511574237995699E+07, 9.7772970960588697E+06, 4.9358776531681546E+05},
+         {1.2660319987326677E+05, 7.7519511328119377E+05, -6.5244610661450895E+06,
+          9.0878257488052379E+06, 2.3116605621149920E+07, -8.7079594462079599E+07,
+          9.5542733739275128E+07, 6.0548970733798724E-02, -9.5542733661364838E+07,
+          8.7079594608550951E+07, -2.3116605559600785E+07, -9.0878257522138134E+06,
+          6.5244610661298726E+06, -7.7519511328133650E+05, -1.2660319987326639E+05},
+         {2.3793325531458529E+04, -4.2305332803808597E+04, -5.2884156985535356E+05,
+          2.5307340127864038E+06, -4.0404175271559842E+06, -1.7519992360184138E+05,
+          1.0146438805818636E+07, -1.5828545480742473E+07, 1.0146438778928882E+07,
+          -1.7520004389869148E+05, -4.0404175770437294E+06, 2.5307340149977510E+06,
+          -5.2884156989405944E+05, -4.2305332803937294E+04, 2.3793325531459184E+04},
+         {2.9741655196834722E+03, -2.0687056403786246E+04, 3.3295507799709936E+04,
+          1.0661145730323243E+05, -5.6644238105382060E+05, 1.0874811616841732E+06,
+          -9.6561270266008016E+05, 1.5626594062671070E-02, 9.6561272951271443E+05,
+          -1.0874812528712249E+06, 5.6644243308078672E+05, -1.0661145838213131E+05,
+          -3.3295507812197495E+04, 2.0687056403630129E+04, -2.9741655196846405E+03},
+         {1.5389176594899303E+02, -2.3864418511494741E+03, 1.0846266954249364E+04,
+          -2.2940053396478714E+04, 1.4780106121058996E+04, 4.2663651769852157E+04,
+          -1.3047648013242516E+05, 1.7468401314164279E+05, -1.3047645484607235E+05,
+          4.2663541429144650E+04, 1.4780036296018619E+04, -2.2940053180976502E+04,
+          1.0846266927315819E+04, -2.3864418517113058E+03, 1.5389176594779781E+02},
+         {-2.3857631312588978E+01, -1.9651606133609231E+01, 6.4183083829803820E+02,
+          -2.8648433109641578E+03, 6.8249243722518859E+03, -9.7944325124827701E+03,
+          7.6177757600121276E+03, 1.8034307737205296E-02, -7.6177559127722052E+03,
+          9.7944326623113047E+03, -6.8249058342322496E+03, 2.8648407117981119E+03,
+          -6.4183085438795774E+02, 1.9651605969778377E+01, 2.3857631312809222E+01},
+         {-6.1348505739169541E+00, 2.7872915855267404E+01, -6.5819942538871970E+01,
+          5.1366231962952028E+01, 1.7213955398158618E+02, -6.9658621010000411E+02,
+          1.3192236112353403E+03, -1.6054106225233884E+03, 1.3192031991952242E+03,
+          -6.9663961216547739E+02, 1.7211403815802629E+02, 5.1367579954366171E+01,
+          -6.5819957939661379E+01, 2.7872915947616441E+01, -6.1348505735855374E+00},
+         {-4.9671584513490097E-01, 3.0617550953446115E+00, -1.1650665638578070E+01,
+          3.0081586723089057E+01, -5.4028356726202020E+01, 6.6077203078498044E+01,
+          -4.7145500171928198E+01, 4.2118837140985958E-03, 4.7167106663349848E+01,
+          -6.6048394423269173E+01, 5.4062906728994193E+01, -3.0081603709324451E+01,
+          1.1650672008416343E+01, -3.0617551285208524E+00, 4.9671584437353217E-01},
+         {4.3460786767313729E-03, -1.3199600771767199E-02, -1.9412688562910244E-01,
+          1.1329433700669471E+00, -3.4442045795063887E+00, 7.1737626956468912E+00,
+          -1.1098109271625262E+01, 1.2385772358881393E+01, -1.1101471316239516E+01,
+          7.0913926025978853E+00, -3.4845491148773502E+00, 1.1323523856621058E+00,
+          -1.9414904754428672E-01, -1.3200165079792004E-02, 4.3460782759443158E-03}}};
+  } else if constexpr (w == 16) {
+    return std::array<std::array<T, w>, nc>{
+        {{3.6434551345570839E+05, 2.0744705928579483E+09, 4.0355760945669995E+11,
+          1.6364575388763029E+13, 2.3514830376056538E+14, 1.5192201717462528E+15,
+          4.9956173084674090E+15, 8.9287666945127360E+15, 8.9287666945127390E+15,
+          4.9956173084674090E+15, 1.5192201717462528E+15, 2.3514830376056538E+14,
+          1.6364575388763035E+13, 4.0355760945670026E+11, 2.0744705928579524E+09,
+          3.6434551345571183E+05},
+         {2.2576246485480359E+06, 6.6499571180086451E+09, 8.7873753526056287E+11,
+          2.5606844387131066E+13, 2.6313738449330153E+14, 1.1495095100701460E+15,
+          2.1932582707747560E+15, 1.2860244365132595E+15, -1.2860244365132600E+15,
+          -2.1932582707747578E+15, -1.1495095100701465E+15, -2.6313738449330159E+14,
+          -2.5606844387131062E+13, -8.7873753526056299E+11, -6.6499571180086451E+09,
+          -2.2576246485480373E+06},
+         {6.3730995546265077E+06, 9.9060026035198078E+09, 8.8097248605449023E+11,
+          1.7953384130753688E+13, 1.2398425545001662E+14, 3.0749346493041262E+14,
+          1.0259777520247159E+14, -5.5291976457534325E+14, -5.5291976457534325E+14,
+          1.0259777520247186E+14, 3.0749346493041219E+14, 1.2398425545001659E+14,
+          1.7953384130753676E+13, 8.8097248605448950E+11, 9.9060026035198040E+09,
+          6.3730995546265030E+06},
+         {1.0896915393078227E+07, 9.0890343524593849E+09, 5.3565169504010010E+11,
+          7.3004206720038701E+12, 2.9692333044160066E+13, 1.6051737468109549E+13,
+          -9.1273329108089906E+13, -8.5999306918502953E+13, 8.5999306918502422E+13,
+          9.1273329108089984E+13, -1.6051737468109510E+13, -2.9692333044160082E+13,
+          -7.3004206720038701E+12, -5.3565169504010022E+11, -9.0890343524593849E+09,
+          -1.0896915393078227E+07},
+         {1.2655725616100594E+07, 5.7342804054544210E+09, 2.1822836608899570E+11,
+          1.8300700858999690E+12, 2.7770431049857676E+12, -8.5034969223852568E+12,
+          -1.2846668467423438E+13, 1.6519076896571838E+13, 1.6519076896572182E+13,
+          -1.2846668467423555E+13, -8.5034969223850703E+12, 2.7770431049857896E+12,
+          1.8300700858999678E+12, 2.1822836608899567E+11, 5.7342804054544210E+09,
+          1.2655725616100591E+07},
+         {1.0609303958036326E+07, 2.6255609052371716E+09, 6.1673589426039413E+10,
+          2.6044432099085333E+11, -3.5431628074578204E+11, -1.6077602129636348E+12,
+          1.5534405614728977E+12, 2.8019935380857432E+12, -2.8019935380841978E+12,
+          -1.5534405614724106E+12, 1.6077602129635625E+12, 3.5431628074580896E+11,
+          -2.6044432099084848E+11, -6.1673589426039429E+10, -2.6255609052371716E+09,
+          -1.0609303958036322E+07},
+         {6.6544809363384582E+06, 8.9490403680928326E+08, 1.1882638725190845E+10,
+          8.1552898137823076E+09, -1.2575562817886868E+11, 2.7074695075907585E+10,
+          3.9453789461955023E+11, -3.1679644857468066E+11, -3.1679644857392346E+11,
+          3.9453789461966650E+11, 2.7074695075992649E+10, -1.2575562817884555E+11,
+          8.1552898137788668E+09, 1.1882638725190889E+10, 8.9490403680928278E+08,
+          6.6544809363384554E+06},
+         {3.1906872142825006E+06, 2.2785946180651775E+08, 1.3744578972809248E+09,
+          -4.3997172592883167E+09, -9.2011130754043922E+09, 3.4690551711832901E+10,
+          -9.4227043395047741E+09, -5.9308465070198639E+10, 5.9308465069336540E+10,
+          9.4227043396350136E+09, -3.4690551711738396E+10, 9.2011130753567543E+09,
+          4.3997172592879610E+09, -1.3744578972813025E+09, -2.2785946180651844E+08,
+          -3.1906872142825015E+06},
+         {1.1821527096621769E+06, 4.2281234059839502E+07, 2.8723226058712766E+07,
+          -8.3553955857628822E+08, 1.2447304828823066E+09, 2.1955280943585949E+09,
+          -7.0514195726908512E+09, 4.3745141239718714E+09, 4.3745141233600502E+09,
+          -7.0514195728029747E+09, 2.1955280943510208E+09, 1.2447304828590808E+09,
+          -8.3553955857879233E+08, 2.8723226058761366E+07, 4.2281234059838109E+07,
+          1.1821527096621762E+06},
+         {3.3854610744280310E+05, 5.2176984975081543E+06, -2.0677283565079328E+07,
+          -3.5831818968518838E+07, 2.6599346106412742E+08, -3.7992777977357000E+08,
+          -1.3426914417466179E+08, 9.1752051229224503E+08, -9.1752051129499328E+08,
+          1.3426914497246322E+08, 3.7992777991069216E+08, -2.6599346104854536E+08,
+          3.5831818968908392E+07, 2.0677283564896725E+07, -5.2176984975075833E+06,
+          -3.3854610744279937E+05},
+         {7.3893334077310064E+04, 2.6983804209559254E+05, -3.6415998561101072E+06,
+          8.4025485849181097E+06, 4.9278860779345948E+06, -5.1437033846752726E+07,
+          8.7603898676325440E+07, -4.6199498412402093E+07, -4.6199498208604209E+07,
+          8.7603898435731798E+07, -5.1437033863736227E+07, 4.9278861005789889E+06,
+          8.4025485831489991E+06, -3.6415998560990733E+06, 2.6983804209473461E+05,
+          7.3893334077307401E+04},
+         {1.1778892113375481E+04, -4.0077190108724200E+04, -1.8372552175909068E+05,
+          1.3262878399160223E+06, -2.9738539927520575E+06, 1.9493509709529271E+06,
+          4.1881949951139782E+06, -1.1066749616505133E+07, 1.1066749327519676E+07,
+          -4.1881946843906553E+06, -1.9493507810665092E+06, 2.9738539818831389E+06,
+          -1.3262878384774840E+06, 1.8372552162922107E+05, 4.0077190107319519E+04,
+          -1.1778892113376129E+04},
+         {1.2019749667923656E+03, -1.0378455844500613E+04, 2.6333352653155256E+04,
+          1.7117060106301305E+04, -2.5133287443653666E+05, 6.4713914262131555E+05,
+          -8.1634942572553246E+05, 3.8623935281825601E+05, 3.8623876433339820E+05,
+          -8.1634960962672008E+05, 6.4713900469564367E+05, -2.5133289627502396E+05,
+          1.7117057951236206E+04, 2.6333352581335013E+04, -1.0378455846609291E+04,
+          1.2019749667911419E+03},
+         {3.1189837632471693E+01, -8.9083493807061564E+02, 4.9454293649337906E+03,
+          -1.3124693635095375E+04, 1.5834784331991095E+04, 6.9607870364081436E+03,
+          -5.9789871879430451E+04, 1.0841726514394575E+05, -1.0841709685990328E+05,
+          5.9790206615067997E+04, -6.9607049368128291E+03, -1.5834783935893831E+04,
+          1.3124692974990443E+04, -4.9454295091588992E+03, 8.9083493794871868E+02,
+          -3.1189837631106176E+01},
+         {-1.2975319073401824E+01, 1.8283698218710011E+01, 1.7684015393859755E+02,
+          -1.1059917445033070E+03, 3.1998168298121523E+03, -5.5988200120063057E+03,
+          5.9248751921324047E+03, -2.5990022806343668E+03, -2.5990962125709430E+03,
+          5.9247537039895724E+03, -5.5988835070734467E+03, 3.1998292349030621E+03,
+          -1.1059926481090836E+03, 1.7684013881079576E+02, 1.8283698123134819E+01,
+          -1.2975319073977776E+01},
+         {-2.3155118729954247E+00, 1.1938503634469159E+01, -3.4150562973753665E+01,
+          4.8898615554511437E+01, 1.5853185548633874E+01, -2.4272678107130790E+02,
+          6.0151276286907887E+02, -8.8751856926690448E+02, 8.8742942550355474E+02,
+          -6.0136491467620624E+02, 2.4282489356694586E+02, -1.5850195971204462E+01,
+          -4.8897392545563044E+01, 3.4150562973753665E+01, -1.1938504430698943E+01,
+          2.3155118723150525E+00},
+         {-1.5401723686076832E-01, 9.8067823888634464E-01, -4.1900843552415639E+00,
+          1.2150534299778382E+01, -2.4763139606227178E+01, 3.6068014621628578E+01,
+          -3.4346647779134791E+01, 1.3259903958585387E+01, 1.2937147675617604E+01,
+          -3.4454233206790519E+01, 3.6027670086257579E+01, -2.4769863695455662E+01,
+          1.2149431128889342E+01, -4.1901615115388706E+00, 9.8067695636810759E-01,
+          -1.5401723756214594E-01},
+         {1.1808835093099178E-02, -2.5444299558662394E-02, -1.5661344238792723E-04,
+          2.5820071204205225E-01, -1.0930950485268096E+00, 2.6408492552008669E+00,
+          -4.4415763059111955E+00, 6.8227366238712817E+00, -6.8186662643534008E+00,
+          4.4887924763186051E+00, -2.6327085361651021E+00, 1.0918739406714428E+00,
+          -2.5844238963842503E-01, 1.2680123888735934E-04, 2.5444206395526567E-02,
+          -1.1808834826225629E-02}}};
+  } else {
+    static_assert(w >= 2, "w must be >= 2");
+    static_assert(w <= 16, "w must be <= 16");
+    return {};
+  }
 };
-
-
diff --git a/src/simpleinterfaces.cpp b/src/simpleinterfaces.cpp
index e07e76c02..8e55eab4a 100644
--- a/src/simpleinterfaces.cpp
+++ b/src/simpleinterfaces.cpp
@@ -1,8 +1,8 @@
 // public header
 #include <finufft.h>
 // private headers
-#include <finufft/defs.h>
 #include <cstdio>
+#include <finufft/defs.h>
 using namespace std;
 
 /* ---------------------------------------------------------------------------
@@ -18,281 +18,280 @@ using namespace std;
    ---------------------------------------------------------------------------
 */
 
-
 // Helper layer ...........................................................
 
 namespace finufft {
-  namespace common {
+namespace common {
 
-int invokeGuruInterface(int n_dims, int type, int n_transf, BIGINT nj, FLT* xj,
-                        FLT *yj, FLT *zj, CPX* cj,int iflag, FLT eps,
-                        BIGINT *n_modes, BIGINT nk, FLT *s, FLT *t,  FLT *u,
-                        CPX* fk, finufft_opts *popts)
+int invokeGuruInterface(int n_dims, int type, int n_transf, BIGINT nj, FLT *xj, FLT *yj,
+                        FLT *zj, CPX *cj, int iflag, FLT eps, BIGINT *n_modes, BIGINT nk,
+                        FLT *s, FLT *t, FLT *u, CPX *fk, finufft_opts *popts)
 // Helper layer between simple interfaces (with opts) and the guru functions.
 // Author: Andrea Malleo, 2019.
 {
   FINUFFT_PLAN plan;
-  int ier = FINUFFT_MAKEPLAN(type, n_dims, n_modes, iflag, n_transf, eps,
-                             &plan, popts);  // popts (ptr to opts) can be NULL
-  if (ier>1) {   // since 1 (a warning) still allows proceeding...
+  int ier = FINUFFT_MAKEPLAN(type, n_dims, n_modes, iflag, n_transf, eps, &plan,
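+                             popts); // popts (ptr to opts) can be NULL
+  // The call above is the first of the three guru stages run by this helper
+  // (make plan, set points, execute). Each stage returns its own status
+  // (ier, ier2, ier3) and is checked the same way: a value greater than 1 is
+  // reported on stderr, the plan is released, and that value is returned at
+  // once; a value of 1 or less lets the helper carry on to the next stage.
+  // The largest of the three statuses is what the helper finally returns.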
+  if (ier > 1) {                     // since 1 (a warning) still allows proceeding...
     fprintf(stderr, "FINUFFT invokeGuru: plan error (ier=%d)!\n", ier);
     delete plan;
     return ier;
   }
 
   int ier2 = FINUFFT_SETPTS(plan, nj, xj, yj, zj, nk, s, t, u);
-  if (ier2>1) {
-    fprintf(stderr,"FINUFFT invokeGuru: setpts error (ier=%d)!\n", ier2);
+  if (ier2 > 1) {
+    fprintf(stderr, "FINUFFT invokeGuru: setpts error (ier=%d)!\n", ier2);
     FINUFFT_DESTROY(plan);
     return ier2;
   }
 
   int ier3 = FINUFFT_EXECUTE(plan, cj, fk);
-  if (ier3>1) {
-    fprintf(stderr,"FINUFFT invokeGuru: execute error (ier=%d)!\n", ier3);
+  if (ier3 > 1) {
+    fprintf(stderr, "FINUFFT invokeGuru: execute error (ier=%d)!\n", ier3);
     FINUFFT_DESTROY(plan);
     return ier3;
   }
 
   FINUFFT_DESTROY(plan);
-  return max(max(ier,ier2),ier3);  // in case any one gave a (positive!) warning
+  return max(max(ier, ier2), ier3); // in case any one gave a (positive!) warning
 }
 
-  }       // namespace
-}       // namespace
+} // namespace common
+} // namespace finufft
 
 using namespace finufft::common;
 
-
 // Dimension 1111111111111111111111111111111111111111111111111111111111111111
 
-int FINUFFT1D1(BIGINT nj,FLT* xj,CPX* cj,int iflag,FLT eps,BIGINT ms,
-	       CPX* fk, finufft_opts *opts)
+int FINUFFT1D1(BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, BIGINT ms, CPX *fk,
+               finufft_opts *opts)
 //  Type-1 1D complex nonuniform FFT. See ../docs/usage.rst
 {
-  BIGINT n_modes[]={ms,1,1};
-  int n_dims = 1;
-  int n_transf = 1;
-  int type = 1;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj,
-			 iflag, eps, n_modes, 0, NULL, NULL, NULL, fk, opts);
+  BIGINT n_modes[] = {ms, 1, 1};
+  int n_dims       = 1;
+  int n_transf     = 1;
+  int type         = 1;
+  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag,
+                                eps, n_modes, 0, NULL, NULL, NULL, fk, opts);
   return ier;
 }
 
-int FINUFFT1D1MANY(int n_transf, BIGINT nj,FLT* xj,CPX* cj,int iflag,FLT eps,
-                   BIGINT ms, CPX* fk, finufft_opts *opts)
+int FINUFFT1D1MANY(int n_transf, BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps,
+                   BIGINT ms, CPX *fk, finufft_opts *opts)
 // Type-1 1D complex nonuniform FFT for many vectors. See ../docs/usage.rst
 {
-  BIGINT n_modes[]={ms,1,1};
-  int n_dims = 1;
-  int type = 1;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj,
-		      iflag, eps, n_modes, 0, NULL, NULL, NULL, fk, opts);
+  BIGINT n_modes[] = {ms, 1, 1};
+  int n_dims       = 1;
+  int type         = 1;
+  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag,
+                                eps, n_modes, 0, NULL, NULL, NULL, fk, opts);
   return ier;
 }
 
-int FINUFFT1D2(BIGINT nj,FLT* xj,CPX* cj,int iflag,FLT eps,BIGINT ms,
-	       CPX* fk, finufft_opts *opts)
+int FINUFFT1D2(BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, BIGINT ms, CPX *fk,
+               finufft_opts *opts)
 //  Type-2 1D complex nonuniform FFT. See ../docs/usage.rst
 {
-  BIGINT n_modes[]={ms,1,1};
-  int n_dims = 1;
-  int n_transf = 1;
-  int type = 2;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj,
-			  iflag, eps, n_modes, 0, NULL, NULL, NULL, fk, opts);
+  BIGINT n_modes[] = {ms, 1, 1};
+  int n_dims       = 1;
+  int n_transf     = 1;
+  int type         = 2;
+  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag,
+                                eps, n_modes, 0, NULL, NULL, NULL, fk, opts);
   return ier;
 }
 
-int FINUFFT1D2MANY(int n_transf, BIGINT nj,FLT* xj,CPX* cj,int iflag,FLT eps,BIGINT ms,
-	       CPX* fk, finufft_opts *opts)
+int FINUFFT1D2MANY(int n_transf, BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps,
+                   BIGINT ms, CPX *fk, finufft_opts *opts)
 //  Type-2 1D complex nonuniform FFT, many vectors. See ../docs/usage.rst
 {
-  BIGINT n_modes[]={ms,1,1};
-  int n_dims = 1;
-  int type = 2;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj,
-		      	iflag, eps, n_modes, 0, NULL, NULL, NULL, fk, opts);
+  BIGINT n_modes[] = {ms, 1, 1};
+  int n_dims       = 1;
+  int type         = 2;
+  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag,
+                                eps, n_modes, 0, NULL, NULL, NULL, fk, opts);
   return ier;
 }
 
-int FINUFFT1D3(BIGINT nj,FLT* xj,CPX* cj,int iflag, FLT eps, BIGINT nk, FLT* s, CPX* fk, finufft_opts *opts)
+int FINUFFT1D3(BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, BIGINT nk, FLT *s,
+               CPX *fk, finufft_opts *opts)
 // Type-3 1D complex nonuniform FFT. See ../docs/usage.rst
 {
-  int n_dims = 1;
+  int n_dims   = 1;
   int n_transf = 1;
-  int type = 3;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj,
-				iflag, eps, NULL, nk, s, NULL, NULL, fk, opts);
+  int type     = 3;
+  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag,
+                                eps, NULL, nk, s, NULL, NULL, fk, opts);
   return ier;
 }
 
-int FINUFFT1D3MANY(int n_transf, BIGINT nj,FLT* xj,CPX* cj,int iflag, FLT eps, BIGINT nk, FLT* s, CPX* fk, finufft_opts *opts)
-  // Type-3 1D complex nonuniform FFT, many vectors. See ../docs/usage.rst
+int FINUFFT1D3MANY(int n_transf, BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps,
+                   BIGINT nk, FLT *s, CPX *fk, finufft_opts *opts)
+// Type-3 1D complex nonuniform FFT, many vectors. See ../docs/usage.rst
 {
   int n_dims = 1;
-  int type = 3;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj,
-				iflag, eps, NULL, nk, s, NULL, NULL, fk, opts);
+  int type   = 3;
+  int ier    = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag,
+                                   eps, NULL, nk, s, NULL, NULL, fk, opts);
   return ier;
 }
 
-
 // Dimension 22222222222222222222222222222222222222222222222222222222222222222
 
-int FINUFFT2D1(BIGINT nj,FLT* xj,FLT *yj,CPX* cj,int iflag,
-	       FLT eps, BIGINT ms, BIGINT mt, CPX* fk, finufft_opts* opts)
+int FINUFFT2D1(BIGINT nj, FLT *xj, FLT *yj, CPX *cj, int iflag, FLT eps, BIGINT ms,
+               BIGINT mt, CPX *fk, finufft_opts *opts)
 //  Type-1 2D complex nonuniform FFT. See ../docs/usage.rst
 {
-  BIGINT n_modes[]={ms,mt,1};
-  int n_dims = 2;
-  int n_transf = 1;
-  int type = 1;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj,
-                          iflag, eps, n_modes, 0, NULL, NULL, NULL,fk, opts);
+  BIGINT n_modes[] = {ms, mt, 1};
+  int n_dims       = 2;
+  int n_transf     = 1;
+  int type         = 1;
+  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj, iflag, eps,
+                                n_modes, 0, NULL, NULL, NULL, fk, opts);
   return ier;
 }
 
-int FINUFFT2D1MANY(int n_transf, BIGINT nj, FLT* xj, FLT *yj, CPX* c,
-		   int iflag, FLT eps, BIGINT ms, BIGINT mt, CPX* fk,
-		   finufft_opts *opts)
+int FINUFFT2D1MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, CPX *c, int iflag, FLT eps,
+                   BIGINT ms, BIGINT mt, CPX *fk, finufft_opts *opts)
 //  Type-1 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst
 {
-  BIGINT n_modes[]={ms,mt,1};
-  int n_dims = 2;
-  int type = 1;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj,NULL, c,
-                        iflag, eps, n_modes, 0, NULL, NULL, NULL, fk, opts);
+  BIGINT n_modes[] = {ms, mt, 1};
+  int n_dims       = 2;
+  int type         = 1;
+  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, c, iflag, eps,
+                                n_modes, 0, NULL, NULL, NULL, fk, opts);
   return ier;
 }
 
-int FINUFFT2D2(BIGINT nj,FLT* xj,FLT *yj,CPX* cj,int iflag,FLT eps,
-	       BIGINT ms, BIGINT mt, CPX* fk, finufft_opts *opts)
+int FINUFFT2D2(BIGINT nj, FLT *xj, FLT *yj, CPX *cj, int iflag, FLT eps, BIGINT ms,
+               BIGINT mt, CPX *fk, finufft_opts *opts)
 //  Type-2 2D complex nonuniform FFT.  See ../docs/usage.rst
 {
-  BIGINT n_modes[]={ms,mt,1};
-  int n_dims = 2;
-  int n_transf = 1;
-  int type = 2;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj, iflag,
-				eps, n_modes, 0, NULL, NULL, NULL, fk, opts);
+  BIGINT n_modes[] = {ms, mt, 1};
+  int n_dims       = 2;
+  int n_transf     = 1;
+  int type         = 2;
+  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj, iflag, eps,
+                                n_modes, 0, NULL, NULL, NULL, fk, opts);
   return ier;
 }
 
-int FINUFFT2D2MANY(int n_transf, BIGINT nj, FLT* xj, FLT *yj, CPX* c, int iflag,
-		   FLT eps, BIGINT ms, BIGINT mt, CPX* fk, finufft_opts *opts)
+int FINUFFT2D2MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, CPX *c, int iflag, FLT eps,
+                   BIGINT ms, BIGINT mt, CPX *fk, finufft_opts *opts)
 //  Type-2 2D complex nonuniform FFT, many vectors.  See ../docs/usage.rst
 {
-  BIGINT n_modes[]={ms,mt,1};
-  int n_dims = 2;
-  int type = 2;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, c, iflag,
-				eps, n_modes, 0, NULL, NULL, NULL, fk, opts);
+  BIGINT n_modes[] = {ms, mt, 1};
+  int n_dims       = 2;
+  int type         = 2;
+  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, c, iflag, eps,
+                                n_modes, 0, NULL, NULL, NULL, fk, opts);
   return ier;
 }
 
-int FINUFFT2D3(BIGINT nj,FLT* xj,FLT* yj,CPX* cj,int iflag, FLT eps, BIGINT nk, FLT* s, FLT *t, CPX* fk, finufft_opts *opts)
+int FINUFFT2D3(BIGINT nj, FLT *xj, FLT *yj, CPX *cj, int iflag, FLT eps, BIGINT nk,
+               FLT *s, FLT *t, CPX *fk, finufft_opts *opts)
 // Type-3 2D complex nonuniform FFT.  See ../docs/usage.rst
 {
-  int n_dims = 2;
-  int type = 3;
+  int n_dims   = 2;
+  int type     = 3;
   int n_transf = 1;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj,iflag, eps, NULL, nk, s,t,NULL, fk, opts);
+  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj, iflag, eps,
+                                NULL, nk, s, t, NULL, fk, opts);
   return ier;
 }
 
-int FINUFFT2D3MANY(int n_transf, BIGINT nj,FLT* xj,FLT* yj,CPX* cj,int iflag, FLT eps, BIGINT nk, FLT* s, FLT *t, CPX* fk, finufft_opts *opts)
+int FINUFFT2D3MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, CPX *cj, int iflag, FLT eps,
+                   BIGINT nk, FLT *s, FLT *t, CPX *fk, finufft_opts *opts)
 // Type-3 2D complex nonuniform FFT, many vectors.  See ../docs/usage.rst
 {
   int n_dims = 2;
-  int type = 3;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj,iflag, eps, NULL, nk, s,t,NULL, fk, opts);
+  int type   = 3;
+  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj, iflag, eps,
+                                NULL, nk, s, t, NULL, fk, opts);
   return ier;
 }
 
-
-
 // Dimension 3333333333333333333333333333333333333333333333333333333333333333
 
-int FINUFFT3D1(BIGINT nj,FLT* xj,FLT *yj,FLT *zj,CPX* cj,int iflag,
-	       FLT eps, BIGINT ms, BIGINT mt, BIGINT mu, CPX* fk,
-	       finufft_opts *opts)
+int FINUFFT3D1(BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, FLT eps,
+               BIGINT ms, BIGINT mt, BIGINT mu, CPX *fk, finufft_opts *opts)
 //  Type-1 3D complex nonuniform FFT.   See ../docs/usage.rst
 {
-  BIGINT n_modes[]={ms,mt,mu};
-  int n_dims = 3;
-  int n_transf = 1;
-  int type = 1;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag,
-				eps, n_modes, 0, NULL, NULL, NULL, fk, opts);
+  BIGINT n_modes[] = {ms, mt, mu};
+  int n_dims       = 3;
+  int n_transf     = 1;
+  int type         = 1;
+  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps,
+                                n_modes, 0, NULL, NULL, NULL, fk, opts);
   return ier;
 }
 
-
-int FINUFFT3D1MANY(int n_transf, BIGINT nj,FLT* xj,FLT *yj,FLT *zj,CPX* cj,
-                   int iflag, FLT eps, BIGINT ms, BIGINT mt, BIGINT mu, CPX* fk,
-                   finufft_opts *opts)
+int FINUFFT3D1MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag,
+                   FLT eps, BIGINT ms, BIGINT mt, BIGINT mu, CPX *fk, finufft_opts *opts)
 // Type-1 3D complex nonuniform FFT, many vectors.  See ../docs/usage.rst
 {
-  BIGINT n_modes[]={ms,mt,mu};
-  int n_dims = 3;
-  int type = 1;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag,
-				eps, n_modes, 0,  NULL, NULL, NULL, fk, opts);
+  BIGINT n_modes[] = {ms, mt, mu};
+  int n_dims       = 3;
+  int type         = 1;
+  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps,
+                                n_modes, 0, NULL, NULL, NULL, fk, opts);
   return ier;
 }
 
-int FINUFFT3D2(BIGINT nj,FLT* xj,FLT *yj,FLT *zj,CPX* cj,
-	       int iflag,FLT eps, BIGINT ms, BIGINT mt, BIGINT mu,
-	       CPX* fk, finufft_opts *opts)
+int FINUFFT3D2(BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, FLT eps,
+               BIGINT ms, BIGINT mt, BIGINT mu, CPX *fk, finufft_opts *opts)
 // Type-2 3D complex nonuniform FFT.   See ../docs/usage.rst
 {
-  BIGINT n_modes[]={ms,mt,mu};
-  int n_dims = 3;
-  int n_transf = 1;
-  int type = 2;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag,
-				eps, n_modes, 0, NULL, NULL, NULL, fk, opts);
+  BIGINT n_modes[] = {ms, mt, mu};
+  int n_dims       = 3;
+  int n_transf     = 1;
+  int type         = 2;
+  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps,
+                                n_modes, 0, NULL, NULL, NULL, fk, opts);
   return ier;
 }
 
-int FINUFFT3D2MANY(int n_transf, BIGINT nj,FLT* xj,FLT *yj,FLT *zj,CPX* cj,
-	       int iflag,FLT eps, BIGINT ms, BIGINT mt, BIGINT mu,
-	       CPX* fk, finufft_opts *opts)
+int FINUFFT3D2MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag,
+                   FLT eps, BIGINT ms, BIGINT mt, BIGINT mu, CPX *fk, finufft_opts *opts)
 // Type-2 3D complex nonuniform FFT, many vectors.   See ../docs/usage.rst
 {
-  BIGINT n_modes[]={ms,mt,mu};
-  n_modes[0] = ms;
-  n_modes[1] = mt;
-  n_modes[2] = mu;
-  int n_dims = 3;
-  int type = 2;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag,
-				eps, n_modes, 0, NULL, NULL, NULL, fk, opts);
+  BIGINT n_modes[] = {ms, mt, mu};
+  n_modes[0]       = ms;
+  n_modes[1]       = mt;
+  n_modes[2]       = mu;
+  int n_dims       = 3;
+  int type         = 2;
+  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps,
+                                n_modes, 0, NULL, NULL, NULL, fk, opts);
   return ier;
 }
 
-int FINUFFT3D3(BIGINT nj,FLT* xj,FLT* yj,FLT *zj, CPX* cj,
-	       int iflag, FLT eps, BIGINT nk, FLT* s, FLT *t,
-	       FLT *u, CPX* fk, finufft_opts *opts)
+int FINUFFT3D3(BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, FLT eps,
+               BIGINT nk, FLT *s, FLT *t, FLT *u, CPX *fk, finufft_opts *opts)
 //  Type-3 3D complex nonuniform FFT.   See ../docs/usage.rst
 {
-  int n_dims = 3;
+  int n_dims   = 3;
   int n_transf = 1;
-  int type = 3;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag,
-				eps, NULL, nk, s ,t ,u, fk, opts);
+  int type     = 3;
+  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps,
+                                NULL, nk, s, t, u, fk, opts);
   return ier;
 }
 
-int FINUFFT3D3MANY(int n_transf, BIGINT nj,FLT* xj,FLT* yj,FLT *zj, CPX* cj,
-	       int iflag, FLT eps, BIGINT nk, FLT* s, FLT *t,
-	       FLT *u, CPX* fk, finufft_opts *opts)
+int FINUFFT3D3MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag,
+                   FLT eps, BIGINT nk, FLT *s, FLT *t, FLT *u, CPX *fk,
+                   finufft_opts *opts)
 //  Type-3 3D complex nonuniform FFT, many vectors.   See ../docs/usage.rst
 {
   int n_dims = 3;
-  int type = 3;
-  int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag,
-				eps, NULL, nk, s ,t ,u, fk, opts);
+  int type   = 3;
+  int ier    = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps,
+                                   NULL, nk, s, t, u, fk, opts);
   return ier;
 }
diff --git a/src/utils.cpp b/src/utils.cpp
index 92f4035eb..8df6ed665 100644
--- a/src/utils.cpp
+++ b/src/utils.cpp
@@ -7,80 +7,80 @@
 #include "finufft/defs.h"
 
 namespace finufft {
-  namespace utils {
+namespace utils {
 
 // ------------ complex array utils ---------------------------------
 
-FLT relerrtwonorm(BIGINT n, CPX* a, CPX* b)
+FLT relerrtwonorm(BIGINT n, CPX *a, CPX *b)
 // ||a-b||_2 / ||a||_2
 {
   FLT err = 0.0, nrm = 0.0;
-  for (BIGINT m=0; m<n; ++m) {
-    nrm += real(conj(a[m])*a[m]);
-    CPX diff = a[m]-b[m];
-    err += real(conj(diff)*diff);
+  for (BIGINT m = 0; m < n; ++m) {
+    nrm += real(conj(a[m]) * a[m]);
+    CPX diff = a[m] - b[m];
+    err += real(conj(diff) * diff);
   }
-  return sqrt(err/nrm);
+  return sqrt(err / nrm);
 }
-FLT errtwonorm(BIGINT n, CPX* a, CPX* b)
+FLT errtwonorm(BIGINT n, CPX *a, CPX *b)
 // ||a-b||_2
 {
-  FLT err = 0.0;   // compute error 2-norm
-  for (BIGINT m=0; m<n; ++m) {
-    CPX diff = a[m]-b[m];
-    err += real(conj(diff)*diff);
+  FLT err = 0.0; // compute error 2-norm
+  for (BIGINT m = 0; m < n; ++m) {
+    CPX diff = a[m] - b[m];
+    err += real(conj(diff) * diff);
   }
   return sqrt(err);
 }
-FLT twonorm(BIGINT n, CPX* a)
+FLT twonorm(BIGINT n, CPX *a)
 // ||a||_2
 {
   FLT nrm = 0.0;
-  for (BIGINT m=0; m<n; ++m)
-    nrm += real(conj(a[m])*a[m]);
+  for (BIGINT m = 0; m < n; ++m) nrm += real(conj(a[m]) * a[m]);
   return sqrt(nrm);
 }
-FLT infnorm(BIGINT n, CPX* a)
+FLT infnorm(BIGINT n, CPX *a)
 // ||a||_infty
 {
   FLT nrm = 0.0;
-  for (BIGINT m=0; m<n; ++m) {
-    FLT aa = real(conj(a[m])*a[m]);
-    if (aa>nrm) nrm = aa;
+  for (BIGINT m = 0; m < n; ++m) {
+    FLT aa = real(conj(a[m]) * a[m]);
+    if (aa > nrm) nrm = aa;
   }
   return sqrt(nrm);
 }
 
 // ------------ real array utils ---------------------------------
 
-void arrayrange(BIGINT n, FLT* a, FLT *lo, FLT *hi)
+void arrayrange(BIGINT n, FLT *a, FLT *lo, FLT *hi)
 // With a a length-n array, writes out min(a) to lo and max(a) to hi,
 // so that all a values lie in [lo,hi].
 // If n==0, lo and hi are not finite.
 {
-  *lo = INFINITY; *hi = -INFINITY;
-  for (BIGINT m=0; m<n; ++m) {
-    if (a[m]<*lo) *lo = a[m];
-    if (a[m]>*hi) *hi = a[m];
+  *lo = INFINITY;
+  *hi = -INFINITY;
+  for (BIGINT m = 0; m < n; ++m) {
+    if (a[m] < *lo) *lo = a[m];
+    if (a[m] > *hi) *hi = a[m];
   }
 }
 
-void arraywidcen(BIGINT n, FLT* a, FLT *w, FLT *c)
+void arraywidcen(BIGINT n, FLT *a, FLT *w, FLT *c)
 // Writes out w = half-width and c = center of an interval enclosing all a[n]'s
 // Only chooses a nonzero center if this increases w by less than fraction
 // ARRAYWIDCEN_GROWFRAC defined in defs.h.
 // This prevents rephasings which don't grow nf by much. 6/8/17
 // If n==0, w and c are not finite.
 {
-  FLT lo,hi;
-  arrayrange(n,a,&lo,&hi);
-  *w = (hi-lo)/2;
-  *c = (hi+lo)/2;
-  if (std::abs(*c)<ARRAYWIDCEN_GROWFRAC*(*w)) {
+  FLT lo, hi;
+  arrayrange(n, a, &lo, &hi);
+  *w = (hi - lo) / 2;
+  *c = (hi + lo) / 2;
+  if (std::abs(*c) < ARRAYWIDCEN_GROWFRAC * (*w)) {
     *w += std::abs(*c);
     *c = 0.0;
   }
 }
 
-  }  // namespace
-}  // namespace
+} // namespace utils
+} // namespace finufft
diff --git a/src/utils_precindep.cpp b/src/utils_precindep.cpp
index 48c7fc0df..194fae7f0 100644
--- a/src/utils_precindep.cpp
+++ b/src/utils_precindep.cpp
@@ -5,38 +5,39 @@
 
 #include <cstdint>
 
-#include "finufft/utils_precindep.h"
 #include "finufft/defs.h"
+#include "finufft/utils_precindep.h"
 using namespace std;
 
 namespace finufft {
-  namespace utils {
+namespace utils {
 
 BIGINT next235even(BIGINT n)
 // finds even integer not less than n, with prime factors no larger than 5
 // (ie, "smooth"). Adapted from fortran in hellskitchen.  Barnett 2/9/17
 // changed INT64 type 3/28/17. Runtime is around n*1e-11 sec for big n.
 {
-  if (n<=2) return 2;
-  if (n%2 == 1) n+=1;   // even
-  BIGINT nplus = n-2;   // to cancel out the +=2 at start of loop
-  BIGINT numdiv = 2;    // a dummy that is >1
-  while (numdiv>1) {
-    nplus += 2;         // stays even
+  if (n <= 2) return 2;
+  if (n % 2 == 1) n += 1;                // even
+  BIGINT nplus  = n - 2;                 // to cancel out the +=2 at start of loop
+  BIGINT numdiv = 2;                     // a dummy that is >1
+  while (numdiv > 1) {
+    nplus += 2;                          // stays even
     numdiv = nplus;
-    while (numdiv%2 == 0) numdiv /= 2;  // remove all factors of 2,3,5...
-    while (numdiv%3 == 0) numdiv /= 3;
-    while (numdiv%5 == 0) numdiv /= 5;
+    while (numdiv % 2 == 0) numdiv /= 2; // remove all factors of 2,3,5...
+    while (numdiv % 3 == 0) numdiv /= 3;
+    while (numdiv % 5 == 0) numdiv /= 5;
   }
   return nplus;
 }
 
 // ----------------------- helpers for timing (always stay double prec) ------
-  
-void CNTime::start()
-{
+
+void CNTime::start() {
   initial = std::chrono::duration_cast<std::chrono::microseconds>(
-            std::chrono::steady_clock::now().time_since_epoch()).count()*1e-6;
+                std::chrono::steady_clock::now().time_since_epoch())
+                .count() *
+            1e-6;
 }
 
 double CNTime::restart()
@@ -51,12 +52,12 @@ double CNTime::elapsedsec()
 // returns answers as double, in seconds, to microsec accuracy. Barnett 5/22/18
 {
   std::uint64_t now = std::chrono::duration_cast<std::chrono::microseconds>(
-                        std::chrono::steady_clock::now().time_since_epoch()).count();
-  const double nowsec = now*1e-6;
+                          std::chrono::steady_clock::now().time_since_epoch())
+                          .count();
+  const double nowsec = now * 1e-6;
   return nowsec - initial;
 }
 
-
 // -------------------------- openmp helpers -------------------------------
 int get_num_threads_parallel_block()
 // return how many threads an omp parallel block would use.
@@ -72,19 +73,18 @@ int get_num_threads_parallel_block()
   return nth_used;
 }
 
-
 // ---------- thread-safe rand number generator for Windows platform ---------
 // (note this is used by macros in defs.h, and supplied in linux/macosx)
 #ifdef _WIN32
 int rand_r(unsigned int *seedp)
 // Libin Lu, 6/18/20
 {
-    std::random_device rd;
-    std::default_random_engine generator(rd());
-    std::uniform_int_distribution<int> distribution(0,RAND_MAX);
-    return distribution(generator);
+  std::random_device rd;
+  std::default_random_engine generator(rd());
+  std::uniform_int_distribution<int> distribution(0, RAND_MAX);
+  return distribution(generator);
 }
 #endif
 
-  } // namespace
-} // namespace
+} // namespace utils
+} // namespace finufft
diff --git a/test/basicpassfail.cpp b/test/basicpassfail.cpp
index c3648d878..c44487925 100644
--- a/test/basicpassfail.cpp
+++ b/test/basicpassfail.cpp
@@ -6,40 +6,39 @@
 // Simplified from Amit Moscovitz and example1d1. Barnett 11/1/18.
 // Using vectors and default opts, 2/29/20; dual-prec lib 7/3/20.
 
-int main()
-{
-  BIGINT M = 1e3, N = 1e3;   // defaults: M = # srcs, N = # modes out
-  double tol = 1e-5;         // req tol, covers both single & double prec cases
-  int isign = +1;            // exponential sign for NUFFT
-  static const CPX I = CPX(0.0,1.0);  // imaginary unit. Note: avoid (CPX) cast
-  std::vector<CPX> F(N);     // alloc output mode coeffs
+int main() {
+  BIGINT M = 1e3, N = 1e3;            // defaults: M = # srcs, N = # modes out
+  double tol         = 1e-5;          // req tol, covers both single & double prec cases
+  int isign          = +1;            // exponential sign for NUFFT
+  static const CPX I = CPX(0.0, 1.0); // imaginary unit. Note: avoid (CPX) cast
+  std::vector<CPX> F(N);              // alloc output mode coeffs
 
   // Make the input data....................................
-  srand(42);                 // seed
-  std::vector<FLT> x(M);     // NU pts locs
-  std::vector<CPX> c(M);     // strengths 
-  for (BIGINT j=0; j<M; ++j) {
-    x[j] = M_PI*(2*((FLT)rand()/(FLT)RAND_MAX)-1);     // uniform random in [-pi,pi)
-    c[j] = 2*((FLT)rand()/(FLT)RAND_MAX)-1 + I*(2*((FLT)rand()/(FLT)RAND_MAX)-1);
+  srand(42);                                               // seed
+  std::vector<FLT> x(M);                                   // NU pts locs
+  std::vector<CPX> c(M);                                   // strengths
+  for (BIGINT j = 0; j < M; ++j) {
+    x[j] = M_PI * (2 * ((FLT)rand() / (FLT)RAND_MAX) - 1); // uniform random in [-pi,pi)
+    c[j] = 2 * ((FLT)rand() / (FLT)RAND_MAX) - 1 +
+           I * (2 * ((FLT)rand() / (FLT)RAND_MAX) - 1);
   }
   // Run it (NULL = default opts) .......................................
-  int ier = FINUFFT1D1(M,&x[0],&c[0],isign,tol,N,&F[0],NULL);
-  if (ier!=0) {
-    printf("basicpassfail: finufft1d1 error (ier=%d)!",ier);
+  int ier = FINUFFT1D1(M, &x[0], &c[0], isign, tol, N, &F[0], NULL);
+  if (ier != 0) {
+    printf("basicpassfail: finufft1d1 error (ier=%d)!", ier);
     exit(ier);
   }
   // Check correct math for a single mode...................
-  BIGINT n = (BIGINT)(0.37*N);   // choose some mode near the top (N/2)
-  CPX Ftest = CPX(0.0,0.0);      // crude exact answer & error check...
-  for (BIGINT j=0; j<M; ++j)
-    Ftest += c[j] * exp((FLT)isign*I*(FLT)n*x[j]);
-  BIGINT nout = n+N/2;           // index in output array for freq mode n
-  FLT Finfnrm = 0.0;             // compute inf norm of F...
-  for (int m=0; m<N; ++m) {
-    FLT aF = abs(F[m]);          // note C++ abs complex type, not C fabs(f)
-    if (aF>Finfnrm) Finfnrm=aF;
+  BIGINT n  = (BIGINT)(0.37 * N); // choose some mode near the top (N/2)
+  CPX Ftest = CPX(0.0, 0.0);      // crude exact answer & error check...
+  for (BIGINT j = 0; j < M; ++j) Ftest += c[j] * exp((FLT)isign * I * (FLT)n * x[j]);
+  BIGINT nout = n + N / 2;        // index in output array for freq mode n
+  FLT Finfnrm = 0.0;              // compute inf norm of F...
+  for (int m = 0; m < N; ++m) {
+    FLT aF = abs(F[m]);           // note C++ abs complex type, not C fabs(f)
+    if (aF > Finfnrm) Finfnrm = aF;
   }
-  FLT relerr = abs(F[nout] - Ftest)/Finfnrm;
-  //printf("requested tol %.3g: rel err for one mode %.3g\n",tol,relerr);
-  return (std::isnan(relerr) || relerr > 10.0*tol);    // true reports failure
+  FLT relerr = abs(F[nout] - Ftest) / Finfnrm;
+  // printf("requested tol %.3g: rel err for one mode %.3g\n",tol,relerr);
+  return (std::isnan(relerr) || relerr > 10.0 * tol); // true reports failure
 }
diff --git a/test/cuda/cufinufft1d_test.cu b/test/cuda/cufinufft1d_test.cu
index bb2d96758..05b62025e 100644
--- a/test/cuda/cufinufft1d_test.cu
+++ b/test/cuda/cufinufft1d_test.cu
@@ -16,190 +16,193 @@
 
 using cufinufft::utils::infnorm;
 
-template <typename T>
+template<typename T>
 int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag) {
-    std::cout << std::scientific << std::setprecision(3);
-    int ier;
-
-    thrust::host_vector<T> x(M);
-    thrust::host_vector<thrust::complex<T>> c(M);
-    thrust::host_vector<thrust::complex<T>> fk(N1);
-
-    thrust::device_vector<T> d_x(M);
-    thrust::device_vector<thrust::complex<T>> d_c(M);
-    thrust::device_vector<thrust::complex<T>> d_fk(N1);
-
-    std::default_random_engine eng(1);
-    std::uniform_real_distribution<T> dist11(-1, 1);
-    auto randm11 = [&eng, &dist11]() { return dist11(eng); };
-
-    // Making data
+  std::cout << std::scientific << std::setprecision(3);
+  int ier;
+
+  thrust::host_vector<T> x(M);
+  thrust::host_vector<thrust::complex<T>> c(M);
+  thrust::host_vector<thrust::complex<T>> fk(N1);
+
+  thrust::device_vector<T> d_x(M);
+  thrust::device_vector<thrust::complex<T>> d_c(M);
+  thrust::device_vector<thrust::complex<T>> d_fk(N1);
+
+  std::default_random_engine eng(1);
+  std::uniform_real_distribution<T> dist11(-1, 1);
+  auto randm11 = [&eng, &dist11]() {
+    return dist11(eng);
+  };
+
+  // Making data
+  for (int i = 0; i < M; i++) {
+    x[i] = M_PI * randm11(); // x in [-pi,pi)
+  }
+  if (type == 1) {
     for (int i = 0; i < M; i++) {
-        x[i] = M_PI * randm11(); // x in [-pi,pi)
+      c[i].real(randm11());
+      c[i].imag(randm11());
     }
-    if (type == 1) {
-        for (int i = 0; i < M; i++) {
-            c[i].real(randm11());
-            c[i].imag(randm11());
-        }
-    } else if (type == 2) {
-        for (int i = 0; i < N1; i++) {
-            fk[i].real(randm11());
-            fk[i].imag(randm11());
-        }
-    } else {
-        std::cerr << "Invalid type " << type << " supplied\n";
-        return 1;
+  } else if (type == 2) {
+    for (int i = 0; i < N1; i++) {
+      fk[i].real(randm11());
+      fk[i].imag(randm11());
     }
-
-    d_x = x;
-    if (type == 1)
-        d_c = c;
-    else if (type == 2)
-        d_fk = fk;
-
-    cudaEvent_t start, stop;
-    float milliseconds = 0;
-    float totaltime = 0;
-    cudaEventCreate(&start);
-    cudaEventCreate(&stop);
-
-    // warm up CUFFT (is slow, takes around 0.2 sec... )
-    cudaEventRecord(start);
-    {
-        int nf1 = 1;
-        cufftHandle fftplan;
-        cufftPlan1d(&fftplan, nf1, cufft_type<T>(), 1);
-    }
-    cudaEventRecord(stop);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&milliseconds, start, stop);
-    printf("[time  ] dummy warmup call to CUFFT\t %.3g s\n", milliseconds / 1000);
-
-    // now to the test...
-    cufinufft_plan_t<T> *dplan;
-    const int dim = 1;
-
-    // Here we setup our own opts, for gpu_method.
-    cufinufft_opts opts;
-    cufinufft_default_opts(&opts);
-
-    opts.gpu_method = method;
-    opts.gpu_maxbatchsize = 1;
-
-    int nmodes[3] = {N1, 1, 1};
-    int ntransf = 1;
-    cudaEventRecord(start);
-
-    ier = cufinufft_makeplan_impl<T>(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts);
-    if (ier != 0) {
-        printf("err: cufinufft1d_plan\n");
-        return ier;
-    }
-    cudaEventRecord(stop);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&milliseconds, start, stop);
-    totaltime += milliseconds;
-    printf("[time  ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000);
-
-    cudaEventRecord(start);
-    ier = cufinufft_setpts_impl<T>(M, d_x.data().get(), NULL, NULL, 0, NULL, NULL, NULL, dplan);
-
-    if (ier != 0) {
-        printf("err: cufinufft_setpts\n");
-        return ier;
-    }
-
-    cudaEventRecord(stop);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&milliseconds, start, stop);
-    totaltime += milliseconds;
-    printf("[time  ] cufinufft setNUpts:\t\t %.3g s\n", milliseconds / 1000);
-
-    cudaEventRecord(start);
-    ier = cufinufft_execute_impl<T>((cuda_complex<T> *)d_c.data().get(), (cuda_complex<T> *)d_fk.data().get(), dplan);
-
-    if (ier != 0) {
-        printf("err: cufinufft1d_exec\n");
-        return ier;
-    }
-
-    cudaEventRecord(stop);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&milliseconds, start, stop);
-    totaltime += milliseconds;
-    float exec_ms = milliseconds;
-    printf("[time  ] cufinufft exec:\t\t %.3g s\n", milliseconds / 1000);
-
-    cudaEventRecord(start);
-    ier = cufinufft_destroy_impl<T>(dplan);
-    if (ier != 0) {
-        printf("err %d: cufinufft1d_destroy\n", ier);
-        return ier;
-    }
-    cudaEventRecord(stop);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&milliseconds, start, stop);
-    totaltime += milliseconds;
-    printf("[time  ] cufinufft destroy:\t\t %.3g s\n", milliseconds / 1000);
-
-    printf("[Method %d] %d U pts to %d NU pts in %.3g s:      %.3g NU pts/s\n", opts.gpu_method, N1, M,
-           totaltime / 1000, M / totaltime * 1000);
-    printf("\t\t\t\t\t(exec-only thoughput: %.3g NU pts/s)\n", M / exec_ms * 1000);
-
-    T rel_error = std::numeric_limits<T>::max();
-    if (type == 1) {
-        fk = d_fk;
-        int nt1 = 0.37 * N1; // choose some mode index to check
-        thrust::complex<T> Ft = thrust::complex<T>(0, 0), J = thrust::complex<T>(0.0, iflag);
-        for (int j = 0; j < M; ++j)
-            Ft += c[j] * exp(J * (nt1 * x[j])); // crude direct
-        int it = N1 / 2 + nt1;                  // index in complex F as 1d array
-
-        rel_error = abs(Ft - fk[it]) / infnorm(N1, (std::complex<T> *)fk.data());
-        printf("[gpu   ] one mode: rel err in F[%d] is %.3g\n", nt1, rel_error);
-    } else if (type == 2) {
-        c = d_c;
-
-        int jt = M / 2; // check arbitrary choice of one targ pt
-        thrust::complex<T> J = thrust::complex<T>(0, iflag);
-        thrust::complex<T> ct = thrust::complex<T>(0, 0);
-        int m = 0;
-        for (int m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1)
-            ct += fk[m++] * exp(J * (m1 * x[jt])); // crude direct
-        rel_error = abs(c[jt] - ct) / infnorm(M, (std::complex<T> *)c.data());
-        printf("[gpu   ] one targ: rel err in c[%d] is %.3g\n", jt, rel_error);
-    }
-
-    return std::isnan(rel_error) || rel_error > checktol;
+  } else {
+    std::cerr << "Invalid type " << type << " supplied\n";
+    return 1;
+  }
+
+  d_x = x;
+  if (type == 1)
+    d_c = c;
+  else if (type == 2)
+    d_fk = fk;
+
+  cudaEvent_t start, stop;
+  float milliseconds = 0;
+  float totaltime    = 0;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+
+  // warm up CUFFT (is slow, takes around 0.2 sec... )
+  cudaEventRecord(start);
+  {
+    int nf1 = 1;
+    cufftHandle fftplan;
+    cufftPlan1d(&fftplan, nf1, cufft_type<T>(), 1);
+  }
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  printf("[time  ] dummy warmup call to CUFFT\t %.3g s\n", milliseconds / 1000);
+
+  // now to the test...
+  cufinufft_plan_t<T> *dplan;
+  const int dim = 1;
+
+  // Here we setup our own opts, for gpu_method.
+  cufinufft_opts opts;
+  cufinufft_default_opts(&opts);
+
+  opts.gpu_method       = method;
+  opts.gpu_maxbatchsize = 1;
+
+  int nmodes[3] = {N1, 1, 1};
+  int ntransf   = 1;
+  cudaEventRecord(start);
+
+  ier = cufinufft_makeplan_impl<T>(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts);
+  if (ier != 0) {
+    printf("err: cufinufft1d_plan\n");
+    return ier;
+  }
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  totaltime += milliseconds;
+  printf("[time  ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000);
+
+  cudaEventRecord(start);
+  ier = cufinufft_setpts_impl<T>(M, d_x.data().get(), NULL, NULL, 0, NULL, NULL, NULL,
+                                 dplan);
+
+  if (ier != 0) {
+    printf("err: cufinufft_setpts\n");
+    return ier;
+  }
+
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  totaltime += milliseconds;
+  printf("[time  ] cufinufft setNUpts:\t\t %.3g s\n", milliseconds / 1000);
+
+  cudaEventRecord(start);
+  ier = cufinufft_execute_impl<T>((cuda_complex<T> *)d_c.data().get(),
+                                  (cuda_complex<T> *)d_fk.data().get(), dplan);
+
+  if (ier != 0) {
+    printf("err: cufinufft1d_exec\n");
+    return ier;
+  }
+
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  totaltime += milliseconds;
+  float exec_ms = milliseconds;
+  printf("[time  ] cufinufft exec:\t\t %.3g s\n", milliseconds / 1000);
+
+  cudaEventRecord(start);
+  ier = cufinufft_destroy_impl<T>(dplan);
+  if (ier != 0) {
+    printf("err %d: cufinufft1d_destroy\n", ier);
+    return ier;
+  }
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  totaltime += milliseconds;
+  printf("[time  ] cufinufft destroy:\t\t %.3g s\n", milliseconds / 1000);
+
+  printf("[Method %d] %d U pts to %d NU pts in %.3g s:      %.3g NU pts/s\n",
+         opts.gpu_method, N1, M, totaltime / 1000, M / totaltime * 1000);
+  printf("\t\t\t\t\t(exec-only thoughput: %.3g NU pts/s)\n", M / exec_ms * 1000);
+
+  T rel_error = std::numeric_limits<T>::max();
+  if (type == 1) {
+    fk                    = d_fk;
+    int nt1               = 0.37 * N1; // choose some mode index to check
+    thrust::complex<T> Ft = thrust::complex<T>(0, 0), J = thrust::complex<T>(0.0, iflag);
+    for (int j = 0; j < M; ++j) Ft += c[j] * exp(J * (nt1 * x[j])); // crude direct
+    int it = N1 / 2 + nt1; // index in complex F as 1d array
+
+    rel_error = abs(Ft - fk[it]) / infnorm(N1, (std::complex<T> *)fk.data());
+    printf("[gpu   ] one mode: rel err in F[%d] is %.3g\n", nt1, rel_error);
+  } else if (type == 2) {
+    c = d_c;
+
+    int jt                = M / 2; // check arbitrary choice of one targ pt
+    thrust::complex<T> J  = thrust::complex<T>(0, iflag);
+    thrust::complex<T> ct = thrust::complex<T>(0, 0);
+    int m                 = 0;
+    for (int m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1)
+      ct += fk[m++] * exp(J * (m1 * x[jt])); // crude direct
+    rel_error = abs(c[jt] - ct) / infnorm(M, (std::complex<T> *)c.data());
+    printf("[gpu   ] one targ: rel err in c[%d] is %.3g\n", jt, rel_error);
+  }
+
+  return std::isnan(rel_error) || rel_error > checktol;
 }
 
 int main(int argc, char *argv[]) {
-    if (argc != 8) {
-        fprintf(stderr, "Usage: cufinufft1d_test method type N1 M tol checktol prec\n"
-                        "Arguments:\n"
-                        "  method: One of\n"
-                        "    1: nupts driven\n"
-                        "  type: Type of transform (1, 2)\n"
-                        "  N1: Number of fourier modes\n"
-                        "  M: The number of non-uniform points\n"
-                        "  tol: NUFFT tolerance\n"
-                        "  checktol:  relative error to pass test\n"
-                        "  precision: f or d\n");
-        return 1;
-    }
-    const int method = atoi(argv[1]);
-    const int type = atoi(argv[2]);
-    const int N1 = atof(argv[3]);
-    const int M = atof(argv[4]);
-    const double tol = atof(argv[5]);
-    const double checktol = atof(argv[6]);
-    const int iflag = 1;
-    const char prec = argv[7][0];
-    if (prec == 'f')
-        return run_test<float>(method, type, N1, M, tol, checktol, iflag);
-    else if (prec == 'd')
-        return run_test<double>(method, type, N1, M, tol, checktol, iflag);
-    else
-        return -1;
+  if (argc != 8) {
+    fprintf(stderr, "Usage: cufinufft1d_test method type N1 M tol checktol prec\n"
+                    "Arguments:\n"
+                    "  method: One of\n"
+                    "    1: nupts driven\n"
+                    "  type: Type of transform (1, 2)\n"
+                    "  N1: Number of fourier modes\n"
+                    "  M: The number of non-uniform points\n"
+                    "  tol: NUFFT tolerance\n"
+                    "  checktol:  relative error to pass test\n"
+                    "  precision: f or d\n");
+    return 1;
+  }
+  const int method      = atoi(argv[1]);
+  const int type        = atoi(argv[2]);
+  const int N1          = atof(argv[3]);
+  const int M           = atof(argv[4]);
+  const double tol      = atof(argv[5]);
+  const double checktol = atof(argv[6]);
+  const int iflag       = 1;
+  const char prec       = argv[7][0];
+  if (prec == 'f')
+    return run_test<float>(method, type, N1, M, tol, checktol, iflag);
+  else if (prec == 'd')
+    return run_test<double>(method, type, N1, M, tol, checktol, iflag);
+  else
+    return -1;
 }
diff --git a/test/cuda/cufinufft2d1nupts_test.cu b/test/cuda/cufinufft2d1nupts_test.cu
index 409c42625..6817712df 100644
--- a/test/cuda/cufinufft2d1nupts_test.cu
+++ b/test/cuda/cufinufft2d1nupts_test.cu
@@ -18,207 +18,213 @@
 
 using cufinufft::utils::infnorm;
 
-template <typename T>
-int run_test(int method) {
-    int N1 = 100;
-    int N2 = 100;
-    int N = N1 * N2;
-    int M1 = N1 * N2;
-    int M2 = 2 * N1 * N2;
-
-    T tol = 1e-5;
-    int iflag = 1;
-
-    std::cout << std::scientific << std::setprecision(3);
-    int ier;
-
-    thrust::host_vector<T> x1(M1), y1(M1);
-    thrust::host_vector<thrust::complex<T>> c1(M1), fk1(N1 * N2);
-    thrust::device_vector<T> d_x1(M1), d_y1(M1);
-    thrust::device_vector<thrust::complex<T>> d_c1(M1), d_fk1(N1 * N2);
-
-    thrust::host_vector<T> x2(M2), y2(M2);
-    thrust::host_vector<thrust::complex<T>> c2(M2), fk2(N1 * N2);
-    thrust::device_vector<T> d_x2(M2), d_y2(M2);
-    thrust::device_vector<thrust::complex<T>> d_c2(M2), d_fk2(N1 * N2);
-
-    std::default_random_engine eng(1);
-    std::uniform_real_distribution<T> dist11(-1, 1);
-    auto randm11 = [&eng, &dist11]() { return dist11(eng); };
-
-    // Making data
-    for (int i = 0; i < M1; i++) {
-        x1[i] = M_PI * randm11(); // x in [-pi,pi)
-        y1[i] = M_PI * randm11();
-        c1[i].real(randm11());
-        c1[i].imag(randm11());
-    }
-
-    for (int i = 0; i < M2; i++) {
-        x2[i] = M_PI * randm11(); // x in [-pi,pi)
-        y2[i] = M_PI * randm11();
-        c2[i].real(randm11());
-        c2[i].imag(randm11());
-    }
-
-    d_x1 = x1;
-    d_y1 = y1;
-    d_c1 = c1;
-    d_x2 = x2;
-    d_y2 = y2;
-    d_c2 = c2;
-
-    cudaEvent_t start, stop;
-    float milliseconds = 0;
-    float totaltime = 0;
-    cudaEventCreate(&start);
-    cudaEventCreate(&stop);
-
-    // warm up CUFFT (is slow, takes around 0.2 sec... )
-    cudaEventRecord(start);
-    {
-        int nf1 = 1;
-        cufftHandle fftplan;
-        cufftPlan1d(&fftplan, nf1, cufft_type<T>(), 1);
-    }
-    cudaEventRecord(stop);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&milliseconds, start, stop);
-    printf("[time  ] dummy warmup call to CUFFT\t %.3g s\n", milliseconds / 1000);
-
-    // now to our tests...
-    cufinufft_plan_t<T> *dplan;
-    int dim = 2;
-    int type = 1;
-
-    // Here we setup our own opts, for gpu_method.
-    cufinufft_opts opts;
-    cufinufft_default_opts(&opts);
-
-    opts.gpu_method = method;
-    opts.gpu_maxbatchsize = 1;
-
-    int nmodes[3];
-    int ntransf = 1;
-
-    nmodes[0] = N1;
-    nmodes[1] = N2;
-    nmodes[2] = 1;
-    cudaEventRecord(start);
-    ier = cufinufft_makeplan_impl<T>(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts);
-    if (ier != 0) {
-        printf("err: cufinufft2d_plan\n");
-        return ier;
-    }
-    cudaEventRecord(stop);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&milliseconds, start, stop);
-    totaltime += milliseconds;
-    printf("[time  ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000);
-
-    cudaEventRecord(start);
-    ier = cufinufft_setpts_impl<T>(M1, d_x1.data().get(), d_y1.data().get(), NULL, 0, NULL, NULL, NULL, dplan);
-    if (ier != 0) {
-        printf("err: cufinufft_setpts (set 1)\n");
-        return ier;
-    }
-    cudaEventRecord(stop);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&milliseconds, start, stop);
-    totaltime += milliseconds;
-    printf("[time  ] cufinufft setNUpts (set 1):\t %.3g s\n", milliseconds / 1000);
-
-    cudaEventRecord(start);
-    ier = cufinufft_execute_impl<T>((cuda_complex<T> *)d_c1.data().get(), (cuda_complex<T> *)d_fk1.data().get(), dplan);
-
-    if (ier != 0) {
-        printf("err: cufinufft2d1_exec (set 1)\n");
-        return ier;
-    }
-    cudaEventRecord(stop);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&milliseconds, start, stop);
-    totaltime += milliseconds;
-    float exec_ms = milliseconds;
-    printf("[time  ] cufinufft exec (set 1):\t %.3g s\n", milliseconds / 1000);
-
-    cudaEventRecord(start);
-    ier = cufinufft_setpts_impl<T>(M2, d_x2.data().get(), d_y2.data().get(), NULL, 0, NULL, NULL, NULL, dplan);
-    if (ier != 0) {
-        printf("err: cufinufft_setpts (set 2)\n");
-        return ier;
-    }
-    cudaEventRecord(stop);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&milliseconds, start, stop);
-    totaltime += milliseconds;
-    printf("[time  ] cufinufft setNUpts (set 2):\t %.3g s\n", milliseconds / 1000);
-
-    cudaEventRecord(start);
-    ier = cufinufft_execute_impl<T>((cuda_complex<T> *)d_c2.data().get(), (cuda_complex<T> *)d_fk2.data().get(), dplan);
-    if (ier != 0) {
-        printf("err: cufinufft2d1_exec (set 2)\n");
-        return ier;
-    }
-    cudaEventRecord(stop);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&milliseconds, start, stop);
-    totaltime += milliseconds;
-    exec_ms += milliseconds;
-    printf("[time  ] cufinufft exec (set 2):\t %.3g s\n", milliseconds / 1000);
-
-    cudaEventRecord(start);
-    ier = cufinufft_destroy_impl<T>(dplan);
-    cudaEventRecord(stop);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&milliseconds, start, stop);
-    totaltime += milliseconds;
-    printf("[time  ] cufinufft destroy:\t\t %.3g s\n", milliseconds / 1000);
-
-    fk1 = d_fk1;
-    fk2 = d_fk2;
-
-    printf("[Method %d] (%d+%d) NU pts to %d U pts in %.3g s:      %.3g NU pts/s\n", opts.gpu_method, M1, M2, N1 * N2,
-           totaltime / 1000, (M1 + M2) / totaltime * 1000);
-    printf("\t\t\t\t\t(exec-only thoughput: %.3g NU pts/s)\n", (M1 + M2) / exec_ms * 1000);
-
-    int nt1 = (int)(0.37 * N1), nt2 = (int)(0.26 * N2); // choose some mode index to check
-    thrust::complex<T> Ft(0, 0), J(0, iflag);
-    for (int j = 0; j < M1; ++j)
-        Ft += c1[j] * exp(J * (nt1 * x1[j] + nt2 * y1[j])); // crude direct
-    int it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2);            // index in complex F as 1d array
-
-    printf("[gpu   ] one mode: rel err in F[%d,%d] is %.3g (set 1)\n", (int)nt1, (int)nt2,
-           abs(Ft - fk1[it]) / infnorm(N, (std::complex<T> *)fk1.data()));
-    Ft = thrust::complex<T>(0, 0);
-    for (int j = 0; j < M2; ++j)
-        Ft += c2[j] * exp(J * (nt1 * x2[j] + nt2 * y2[j])); // crude direct
-    printf("[gpu   ] one mode: rel err in F[%d,%d] is %.3g (set 2)\n", (int)nt1, (int)nt2,
-           abs(Ft - fk2[it]) / infnorm(N, (std::complex<T> *)fk2.data()));
-
-    return 0;
+template<typename T> int run_test(int method) {
+  int N1 = 100;
+  int N2 = 100;
+  int N  = N1 * N2;
+  int M1 = N1 * N2;
+  int M2 = 2 * N1 * N2;
+
+  T tol     = 1e-5;
+  int iflag = 1;
+
+  std::cout << std::scientific << std::setprecision(3);
+  int ier;
+
+  thrust::host_vector<T> x1(M1), y1(M1);
+  thrust::host_vector<thrust::complex<T>> c1(M1), fk1(N1 * N2);
+  thrust::device_vector<T> d_x1(M1), d_y1(M1);
+  thrust::device_vector<thrust::complex<T>> d_c1(M1), d_fk1(N1 * N2);
+
+  thrust::host_vector<T> x2(M2), y2(M2);
+  thrust::host_vector<thrust::complex<T>> c2(M2), fk2(N1 * N2);
+  thrust::device_vector<T> d_x2(M2), d_y2(M2);
+  thrust::device_vector<thrust::complex<T>> d_c2(M2), d_fk2(N1 * N2);
+
+  std::default_random_engine eng(1);
+  std::uniform_real_distribution<T> dist11(-1, 1);
+  auto randm11 = [&eng, &dist11]() {
+    return dist11(eng);
+  };
+
+  // Making data
+  for (int i = 0; i < M1; i++) {
+    x1[i] = M_PI * randm11(); // x in [-pi,pi)
+    y1[i] = M_PI * randm11();
+    c1[i].real(randm11());
+    c1[i].imag(randm11());
+  }
+
+  for (int i = 0; i < M2; i++) {
+    x2[i] = M_PI * randm11(); // x in [-pi,pi)
+    y2[i] = M_PI * randm11();
+    c2[i].real(randm11());
+    c2[i].imag(randm11());
+  }
+
+  d_x1 = x1;
+  d_y1 = y1;
+  d_c1 = c1;
+  d_x2 = x2;
+  d_y2 = y2;
+  d_c2 = c2;
+
+  cudaEvent_t start, stop;
+  float milliseconds = 0;
+  float totaltime    = 0;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+
+  // warm up CUFFT (is slow, takes around 0.2 sec... )
+  cudaEventRecord(start);
+  {
+    int nf1 = 1;
+    cufftHandle fftplan;
+    cufftPlan1d(&fftplan, nf1, cufft_type<T>(), 1);
+  }
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  printf("[time  ] dummy warmup call to CUFFT\t %.3g s\n", milliseconds / 1000);
+
+  // now to our tests...
+  cufinufft_plan_t<T> *dplan;
+  int dim  = 2;
+  int type = 1;
+
+  // Here we setup our own opts, for gpu_method.
+  cufinufft_opts opts;
+  cufinufft_default_opts(&opts);
+
+  opts.gpu_method       = method;
+  opts.gpu_maxbatchsize = 1;
+
+  int nmodes[3];
+  int ntransf = 1;
+
+  nmodes[0] = N1;
+  nmodes[1] = N2;
+  nmodes[2] = 1;
+  cudaEventRecord(start);
+  ier = cufinufft_makeplan_impl<T>(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts);
+  if (ier != 0) {
+    printf("err: cufinufft2d_plan\n");
+    return ier;
+  }
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  totaltime += milliseconds;
+  printf("[time  ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000);
+
+  cudaEventRecord(start);
+  ier = cufinufft_setpts_impl<T>(M1, d_x1.data().get(), d_y1.data().get(), NULL, 0, NULL,
+                                 NULL, NULL, dplan);
+  if (ier != 0) {
+    printf("err: cufinufft_setpts (set 1)\n");
+    return ier;
+  }
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  totaltime += milliseconds;
+  printf("[time  ] cufinufft setNUpts (set 1):\t %.3g s\n", milliseconds / 1000);
+
+  cudaEventRecord(start);
+  ier = cufinufft_execute_impl<T>((cuda_complex<T> *)d_c1.data().get(),
+                                  (cuda_complex<T> *)d_fk1.data().get(), dplan);
+
+  if (ier != 0) {
+    printf("err: cufinufft2d1_exec (set 1)\n");
+    return ier;
+  }
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  totaltime += milliseconds;
+  float exec_ms = milliseconds;
+  printf("[time  ] cufinufft exec (set 1):\t %.3g s\n", milliseconds / 1000);
+
+  cudaEventRecord(start);
+  ier = cufinufft_setpts_impl<T>(M2, d_x2.data().get(), d_y2.data().get(), NULL, 0, NULL,
+                                 NULL, NULL, dplan);
+  if (ier != 0) {
+    printf("err: cufinufft_setpts (set 2)\n");
+    return ier;
+  }
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  totaltime += milliseconds;
+  printf("[time  ] cufinufft setNUpts (set 2):\t %.3g s\n", milliseconds / 1000);
+
+  cudaEventRecord(start);
+  ier = cufinufft_execute_impl<T>((cuda_complex<T> *)d_c2.data().get(),
+                                  (cuda_complex<T> *)d_fk2.data().get(), dplan);
+  if (ier != 0) {
+    printf("err: cufinufft2d1_exec (set 2)\n");
+    return ier;
+  }
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  totaltime += milliseconds;
+  exec_ms += milliseconds;
+  printf("[time  ] cufinufft exec (set 2):\t %.3g s\n", milliseconds / 1000);
+
+  cudaEventRecord(start);
+  ier = cufinufft_destroy_impl<T>(dplan);
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  totaltime += milliseconds;
+  printf("[time  ] cufinufft destroy:\t\t %.3g s\n", milliseconds / 1000);
+
+  fk1 = d_fk1;
+  fk2 = d_fk2;
+
+  printf("[Method %d] (%d+%d) NU pts to %d U pts in %.3g s:      %.3g NU pts/s\n",
+         opts.gpu_method, M1, M2, N1 * N2, totaltime / 1000,
+         (M1 + M2) / totaltime * 1000);
+  printf("\t\t\t\t\t(exec-only thoughput: %.3g NU pts/s)\n", (M1 + M2) / exec_ms * 1000);
+
+  int nt1 = (int)(0.37 * N1), nt2 = (int)(0.26 * N2); // choose some mode index to check
+  thrust::complex<T> Ft(0, 0), J(0, iflag);
+  for (int j = 0; j < M1; ++j)
+    Ft += c1[j] * exp(J * (nt1 * x1[j] + nt2 * y1[j])); // crude direct
+  int it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2);          // index in complex F as 1d array
+
+  printf("[gpu   ] one mode: rel err in F[%d,%d] is %.3g (set 1)\n", (int)nt1, (int)nt2,
+         abs(Ft - fk1[it]) / infnorm(N, (std::complex<T> *)fk1.data()));
+  Ft = thrust::complex<T>(0, 0);
+  for (int j = 0; j < M2; ++j)
+    Ft += c2[j] * exp(J * (nt1 * x2[j] + nt2 * y2[j])); // crude direct
+  printf("[gpu   ] one mode: rel err in F[%d,%d] is %.3g (set 2)\n", (int)nt1, (int)nt2,
+         abs(Ft - fk2[it]) / infnorm(N, (std::complex<T> *)fk2.data()));
+
+  return 0;
 }
 
 int main(int argc, char *argv[]) {
-    if (argc < 3) {
-        fprintf(stderr, "Usage: cufinufft2d1nupts_test method\n"
-                        "Arguments:\n"
-                        "  method: One of\n"
-                        "    1: nupts driven,\n"
-                        "    2: sub-problem, or\n"
-                        "  precision: f or d\n");
-        return 1;
-    }
-    int method;
-    sscanf(argv[1], "%d", &method);
-    char prec = argv[2][0];
-
-    if (prec == 'f')
-        return run_test<float>(method);
-    else if (prec == 'd')
-        return run_test<double>(method);
-    else
-        fprintf(stderr, "Invalid precision supplied: %s\n", argv[2]);
-
+  if (argc < 3) {
+    fprintf(stderr, "Usage: cufinufft2d1nupts_test method\n"
+                    "Arguments:\n"
+                    "  method: One of\n"
+                    "    1: nupts driven,\n"
+                    "    2: sub-problem, or\n"
+                    "  precision: f or d\n");
     return 1;
+  }
+  int method;
+  sscanf(argv[1], "%d", &method);
+  char prec = argv[2][0];
+
+  if (prec == 'f')
+    return run_test<float>(method);
+  else if (prec == 'd')
+    return run_test<double>(method);
+  else
+    fprintf(stderr, "Invalid precision supplied: %s\n", argv[2]);
+
+  return 1;
 }
diff --git a/test/cuda/cufinufft2d_test.cu b/test/cuda/cufinufft2d_test.cu
index 371b44b2f..2ce430eb6 100644
--- a/test/cuda/cufinufft2d_test.cu
+++ b/test/cuda/cufinufft2d_test.cu
@@ -17,189 +17,194 @@
 
 using cufinufft::utils::infnorm;
 
-template <typename T>
+template<typename T>
 int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int iflag) {
-    std::cout << std::scientific << std::setprecision(3);
-
-    thrust::host_vector<T> x(M), y(M);
-    thrust::host_vector<thrust::complex<T>> c(M), fk(N1 * N2);
-
-    thrust::device_vector<T> d_x(M), d_y(M);
-    thrust::device_vector<thrust::complex<T>> d_c(M), d_fk(N1 * N2);
-
-    std::default_random_engine eng(1);
-    std::uniform_real_distribution<T> dist11(-1, 1);
-    auto randm11 = [&eng, &dist11]() { return dist11(eng); };
-
-    // Making data
+  std::cout << std::scientific << std::setprecision(3);
+
+  thrust::host_vector<T> x(M), y(M);
+  thrust::host_vector<thrust::complex<T>> c(M), fk(N1 * N2);
+
+  thrust::device_vector<T> d_x(M), d_y(M);
+  thrust::device_vector<thrust::complex<T>> d_c(M), d_fk(N1 * N2);
+
+  std::default_random_engine eng(1);
+  std::uniform_real_distribution<T> dist11(-1, 1);
+  auto randm11 = [&eng, &dist11]() {
+    return dist11(eng);
+  };
+
+  // Making data
+  for (int i = 0; i < M; i++) {
+    x[i] = M_PI * randm11(); // x in [-pi,pi)
+    y[i] = M_PI * randm11();
+  }
+  if (type == 1) {
     for (int i = 0; i < M; i++) {
-        x[i] = M_PI * randm11(); // x in [-pi,pi)
-        y[i] = M_PI * randm11();
-    }
-    if (type == 1) {
-        for (int i = 0; i < M; i++) {
-            c[i].real(randm11());
-            c[i].imag(randm11());
-        }
-    } else if (type == 2) {
-        for (int i = 0; i < N1 * N2; i++) {
-            fk[i].real(randm11());
-            fk[i].imag(randm11());
-        }
-    } else {
-        std::cerr << "Invalid type " << type << " supplied\n";
-        return 1;
-    }
-
-    d_x = x;
-    d_y = y;
-    if (type == 1)
-        d_c = c;
-    else if (type == 2)
-        d_fk = fk;
-
-    cudaEvent_t start, stop;
-    float milliseconds = 0;
-    float totaltime = 0;
-    cudaEventCreate(&start);
-    cudaEventCreate(&stop);
-
-    // warm up CUFFT (is slow, takes around 0.2 sec... )
-    cudaEventRecord(start);
-    {
-        int nf1 = 1;
-        cufftHandle fftplan;
-        cufftPlan1d(&fftplan, nf1, cufft_type<T>(), 1);
-    }
-    cudaEventRecord(stop);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&milliseconds, start, stop);
-    printf("[time  ] dummy warmup call to CUFFT\t %.3g s\n", milliseconds / 1000);
-
-    // now to our tests...
-    cufinufft_plan_t<T> *dplan;
-    const int dim = 2;
-
-    // Here we setup our own opts, for gpu_method.
-    cufinufft_opts opts;
-    cufinufft_default_opts(&opts);
-
-    opts.gpu_method = method;
-    opts.gpu_maxbatchsize = 1;
-
-    int nmodes[3] = {N1, N2, 1};
-    int ntransf = 1;
-    cudaEventRecord(start);
-    int ier = cufinufft_makeplan_impl(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts);
-    if (ier != 0) {
-        printf("err: cufinufft2d_plan\n");
-        return ier;
-    }
-    cudaEventRecord(stop);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&milliseconds, start, stop);
-    totaltime += milliseconds;
-    printf("[time  ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000);
-
-    cudaEventRecord(start);
-    ier = cufinufft_setpts_impl<T>(M, d_x.data().get(), d_y.data().get(), nullptr, 0, nullptr, nullptr, nullptr, dplan);
-    if (ier != 0) {
-        printf("err: cufinufft_setpts\n");
-        return ier;
+      c[i].real(randm11());
+      c[i].imag(randm11());
     }
-    cudaEventRecord(stop);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&milliseconds, start, stop);
-    totaltime += milliseconds;
-    printf("[time  ] cufinufft setNUpts:\t\t %.3g s\n", milliseconds / 1000);
-
-    cudaEventRecord(start);
-    ier = cufinufft_execute_impl<T>((cuda_complex<T> *)d_c.data().get(), (cuda_complex<T> *)d_fk.data().get(), dplan);
-    if (ier != 0) {
-        printf("err: cufinufft2d1_exec\n");
-        return ier;
+  } else if (type == 2) {
+    for (int i = 0; i < N1 * N2; i++) {
+      fk[i].real(randm11());
+      fk[i].imag(randm11());
     }
-    cudaEventRecord(stop);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&milliseconds, start, stop);
-    totaltime += milliseconds;
-    float exec_ms = milliseconds;
-    printf("[time  ] cufinufft exec:\t\t %.3g s\n", milliseconds / 1000);
-
-    cudaEventRecord(start);
-    ier = cufinufft_destroy_impl<T>(dplan);
-    cudaEventRecord(stop);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&milliseconds, start, stop);
-    totaltime += milliseconds;
-    printf("[time  ] cufinufft destroy:\t\t %.3g s\n", milliseconds / 1000);
-
-    if (type == 1)
-        fk = d_fk;
-    else if (type == 2)
-        c = d_c;
-
-    printf("[Method %d] %d NU pts to %d U pts in %.3g s:      %.3g NU pts/s\n", opts.gpu_method, M, N1 * N2,
-           totaltime / 1000, M / totaltime * 1000);
-    printf("\t\t\t\t\t(exec-only thoughput: %.3g NU pts/s)\n", M / exec_ms * 1000);
-
-    T rel_error = std::numeric_limits<T>::max();
-    if (type == 1) {
-        const int nt1 = 0.37 * N1;
-        const int nt2 = 0.26 * N2; // choose some mode index to check
-        thrust::complex<T> Ft = thrust::complex<T>(0, 0), J = thrust::complex<T>(0.0, iflag);
-        for (int j = 0; j < M; ++j)
-            Ft += c[j] * exp(J * (nt1 * x[j] + nt2 * y[j])); // crude direct
-        const int it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2);   // index in complex F as 1d array
-
-        rel_error = abs(Ft - fk[it]) / infnorm(N1, (std::complex<T> *)fk.data());
-        printf("[gpu   ] one mode: rel err in F[%d,%d] is %.3g\n", nt1, nt2, rel_error);
-    } else if (type == 2) {
-        int jt = M / 2; // check arbitrary choice of one targ pt
-        thrust::complex<T> J = thrust::complex<T>(0, iflag);
-        thrust::complex<T> ct = thrust::complex<T>(0, 0);
-
-        int m = 0;
-        for (int m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2) // loop in correct order over F
-            for (int m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1)
-                ct += fk[m++] * exp(J * (m1 * x[jt] + m2 * y[jt])); // crude direct
-
-        rel_error = abs(c[jt] - ct) / infnorm(M, (std::complex<T> *)c.data());
-        printf("[gpu   ] one targ: rel err in c[%d] is %.3g\n", jt, rel_error);
-    }
-
-    return std::isnan(rel_error) || rel_error > checktol;
+  } else {
+    std::cerr << "Invalid type " << type << " supplied\n";
+    return 1;
+  }
+
+  d_x = x;
+  d_y = y;
+  if (type == 1)
+    d_c = c;
+  else if (type == 2)
+    d_fk = fk;
+
+  cudaEvent_t start, stop;
+  float milliseconds = 0;
+  float totaltime    = 0;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+
+  // warm up CUFFT (is slow, takes around 0.2 sec... )
+  cudaEventRecord(start);
+  {
+    int nf1 = 1;
+    cufftHandle fftplan;
+    cufftPlan1d(&fftplan, nf1, cufft_type<T>(), 1);
+  }
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  printf("[time  ] dummy warmup call to CUFFT\t %.3g s\n", milliseconds / 1000);
+
+  // now to our tests...
+  cufinufft_plan_t<T> *dplan;
+  const int dim = 2;
+
+  // Here we setup our own opts, for gpu_method.
+  cufinufft_opts opts;
+  cufinufft_default_opts(&opts);
+
+  opts.gpu_method       = method;
+  opts.gpu_maxbatchsize = 1;
+
+  int nmodes[3] = {N1, N2, 1};
+  int ntransf   = 1;
+  cudaEventRecord(start);
+  int ier =
+      cufinufft_makeplan_impl(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts);
+  if (ier != 0) {
+    printf("err: cufinufft2d_plan\n");
+    return ier;
+  }
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  totaltime += milliseconds;
+  printf("[time  ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000);
+
+  cudaEventRecord(start);
+  ier = cufinufft_setpts_impl<T>(M, d_x.data().get(), d_y.data().get(), nullptr, 0,
+                                 nullptr, nullptr, nullptr, dplan);
+  if (ier != 0) {
+    printf("err: cufinufft_setpts\n");
+    return ier;
+  }
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  totaltime += milliseconds;
+  printf("[time  ] cufinufft setNUpts:\t\t %.3g s\n", milliseconds / 1000);
+
+  cudaEventRecord(start);
+  ier = cufinufft_execute_impl<T>((cuda_complex<T> *)d_c.data().get(),
+                                  (cuda_complex<T> *)d_fk.data().get(), dplan);
+  if (ier != 0) {
+    printf("err: cufinufft2d1_exec\n");
+    return ier;
+  }
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  totaltime += milliseconds;
+  float exec_ms = milliseconds;
+  printf("[time  ] cufinufft exec:\t\t %.3g s\n", milliseconds / 1000);
+
+  cudaEventRecord(start);
+  ier = cufinufft_destroy_impl<T>(dplan);
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  totaltime += milliseconds;
+  printf("[time  ] cufinufft destroy:\t\t %.3g s\n", milliseconds / 1000);
+
+  if (type == 1)
+    fk = d_fk;
+  else if (type == 2)
+    c = d_c;
+
+  printf("[Method %d] %d NU pts to %d U pts in %.3g s:      %.3g NU pts/s\n",
+         opts.gpu_method, M, N1 * N2, totaltime / 1000, M / totaltime * 1000);
+  printf("\t\t\t\t\t(exec-only thoughput: %.3g NU pts/s)\n", M / exec_ms * 1000);
+
+  T rel_error = std::numeric_limits<T>::max();
+  if (type == 1) {
+    const int nt1         = 0.37 * N1;
+    const int nt2         = 0.26 * N2; // choose some mode index to check
+    thrust::complex<T> Ft = thrust::complex<T>(0, 0), J = thrust::complex<T>(0.0, iflag);
+    for (int j = 0; j < M; ++j)
+      Ft += c[j] * exp(J * (nt1 * x[j] + nt2 * y[j])); // crude direct
+    const int it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2); // index in complex F as 1d array
+
+    rel_error = abs(Ft - fk[it]) / infnorm(N1, (std::complex<T> *)fk.data());
+    printf("[gpu   ] one mode: rel err in F[%d,%d] is %.3g\n", nt1, nt2, rel_error);
+  } else if (type == 2) {
+    int jt                = M / 2; // check arbitrary choice of one targ pt
+    thrust::complex<T> J  = thrust::complex<T>(0, iflag);
+    thrust::complex<T> ct = thrust::complex<T>(0, 0);
+
+    int m = 0;
+    for (int m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2) // loop in correct order over F
+      for (int m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1)
+        ct += fk[m++] * exp(J * (m1 * x[jt] + m2 * y[jt])); // crude direct
+
+    rel_error = abs(c[jt] - ct) / infnorm(M, (std::complex<T> *)c.data());
+    printf("[gpu   ] one targ: rel err in c[%d] is %.3g\n", jt, rel_error);
+  }
+
+  return std::isnan(rel_error) || rel_error > checktol;
 }
 
 int main(int argc, char *argv[]) {
-    if (argc != 9) {
-        fprintf(stderr, "Usage: cufinufft2d1_test method N1 N2 M tol checktol\n"
-                        "Arguments:\n"
-                        "  method: One of\n"
-                        "    1: nupts driven,\n"
-                        "    2: sub-problem, or\n"
-                        "  type: Type of transform (1, 2)"
-                        "  N1, N2: The size of the 2D array\n"
-                        "  M: The number of non-uniform points\n"
-                        "  tol: NUFFT tolerance\n"
-                        "  checktol: relative error to pass test\n"
-                        "  prec:  'f' or 'd' (float/double)\n");
-        return 1;
-    }
-    const int method = atoi(argv[1]);
-    const int type = atoi(argv[2]);
-    const int N1 = atof(argv[3]);
-    const int N2 = atof(argv[4]);
-    const int M = atof(argv[5]);
-    const double tol = atof(argv[6]);
-    const double checktol = atof(argv[7]);
-    const char prec = argv[8][0];
-    const int iflag = 1;
-
-    if (prec == 'f')
-        return run_test<float>(method, type, N1, N2, M, tol, checktol, iflag);
-    else if (prec == 'd')
-        return run_test<double>(method, type, N1, N2, M, tol, checktol, iflag);
-    else
-        return -1;
+  if (argc != 9) {
+    fprintf(stderr, "Usage: cufinufft2d1_test method N1 N2 M tol checktol\n"
+                    "Arguments:\n"
+                    "  method: One of\n"
+                    "    1: nupts driven,\n"
+                    "    2: sub-problem, or\n"
+                    "  type: Type of transform (1, 2)"
+                    "  N1, N2: The size of the 2D array\n"
+                    "  M: The number of non-uniform points\n"
+                    "  tol: NUFFT tolerance\n"
+                    "  checktol: relative error to pass test\n"
+                    "  prec:  'f' or 'd' (float/double)\n");
+    return 1;
+  }
+  const int method      = atoi(argv[1]);
+  const int type        = atoi(argv[2]);
+  const int N1          = atof(argv[3]);
+  const int N2          = atof(argv[4]);
+  const int M           = atof(argv[5]);
+  const double tol      = atof(argv[6]);
+  const double checktol = atof(argv[7]);
+  const char prec       = argv[8][0];
+  const int iflag       = 1;
+
+  if (prec == 'f')
+    return run_test<float>(method, type, N1, N2, M, tol, checktol, iflag);
+  else if (prec == 'd')
+    return run_test<double>(method, type, N1, N2, M, tol, checktol, iflag);
+  else
+    return -1;
 }
diff --git a/test/cuda/cufinufft2dmany_test.cu b/test/cuda/cufinufft2dmany_test.cu
index 96f3cecf3..0a9e45d00 100644
--- a/test/cuda/cufinufft2dmany_test.cu
+++ b/test/cuda/cufinufft2dmany_test.cu
@@ -17,195 +17,208 @@
 
 using cufinufft::utils::infnorm;
 
-template <typename T>
-int run_test(int method, int type, int N1, int N2, int ntransf, int maxbatchsize, int M, T tol, T checktol, int iflag) {
-    std::cout << std::scientific << std::setprecision(3);
-
-    int ier;
-    const int N = N1 * N2;
-    printf("#modes = %d, #inputs = %d, #NUpts = %d\n", N, ntransf, M);
-
-    thrust::host_vector<T> x(M), y(M);
-    thrust::host_vector<thrust::complex<T>> c(M * ntransf), fk(ntransf * N1 * N2);
-
-    thrust::device_vector<T> d_x(M), d_y(M);
-    thrust::device_vector<thrust::complex<T>> d_c(M * ntransf), d_fk(ntransf * N1 * N2);
-
-    std::default_random_engine eng(1);
-    std::uniform_real_distribution<T> dist11(-1, 1);
-    auto randm11 = [&eng, &dist11]() { return dist11(eng); };
-
-    // Making data
-    for (int i = 0; i < M; i++) {
-        x[i] = M_PI * randm11(); // x in [-pi,pi)
-        y[i] = M_PI * randm11();
-    }
-    if (type == 1) {
-        for (int i = 0; i < ntransf * M; i++) {
-            c[i].real(randm11());
-            c[i].imag(randm11());
-        }
-    } else if (type == 2) {
-        for (int i = 0; i < ntransf * N1 * N2; i++) {
-            fk[i].real(randm11());
-            fk[i].imag(randm11());
-        }
-    } else {
-        std::cerr << "Invalid type " << type << " supplied\n";
-        return 1;
+template<typename T>
+int run_test(int method, int type, int N1, int N2, int ntransf, int maxbatchsize, int M,
+             T tol, T checktol, int iflag) {
+  std::cout << std::scientific << std::setprecision(3);
+
+  int ier;
+  const int N = N1 * N2;
+  printf("#modes = %d, #inputs = %d, #NUpts = %d\n", N, ntransf, M);
+
+  thrust::host_vector<T> x(M), y(M);
+  thrust::host_vector<thrust::complex<T>> c(M * ntransf), fk(ntransf * N1 * N2);
+
+  thrust::device_vector<T> d_x(M), d_y(M);
+  thrust::device_vector<thrust::complex<T>> d_c(M * ntransf), d_fk(ntransf * N1 * N2);
+
+  std::default_random_engine eng(1);
+  std::uniform_real_distribution<T> dist11(-1, 1);
+  auto randm11 = [&eng, &dist11]() {
+    return dist11(eng);
+  };
+
+  // Making data
+  for (int i = 0; i < M; i++) {
+    x[i] = M_PI * randm11(); // x in [-pi,pi)
+    y[i] = M_PI * randm11();
+  }
+  if (type == 1) {
+    for (int i = 0; i < ntransf * M; i++) {
+      c[i].real(randm11());
+      c[i].imag(randm11());
     }
-
-    d_x = x;
-    d_y = y;
-    if (type == 1)
-        d_c = c;
-    else if (type == 2)
-        d_fk = fk;
-
-    cudaEvent_t start, stop;
-    float milliseconds = 0;
-    double totaltime = 0;
-    cudaEventCreate(&start);
-    cudaEventCreate(&stop);
-
-    // warm up CUFFT (is slow, takes around 0.2 sec... )
-    cudaEventRecord(start);
-    {
-        int nf1 = 1;
-        cufftHandle fftplan;
-        cufftPlan1d(&fftplan, nf1, cufft_type<T>(), 1);
+  } else if (type == 2) {
+    for (int i = 0; i < ntransf * N1 * N2; i++) {
+      fk[i].real(randm11());
+      fk[i].imag(randm11());
     }
-    cudaEventRecord(stop);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&milliseconds, start, stop);
-    printf("[time  ] dummy warmup call to CUFFT\t %.3g s\n", milliseconds / 1000);
-
-    // now to the test...
-    cufinufft_plan_t<T> *dplan;
-    int dim = 2;
-
-    // Here we setup our own opts, for gpu_method.
-    cufinufft_opts opts;
-    cufinufft_default_opts(&opts);
-
-    opts.gpu_method = method;
-    opts.gpu_maxbatchsize = maxbatchsize;
-
-    int nmodes[3] = {N1, N2, 1};
-    cudaEventRecord(start);
-    ier = cufinufft_makeplan_impl<T>(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts);
-    if (ier != 0) {
-        printf("err: cufinufft2d_plan\n");
-        return ier;
-    }
-    cudaEventRecord(stop);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&milliseconds, start, stop);
-    totaltime += milliseconds;
-    printf("[time  ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000);
-
-    cudaEventRecord(start);
-    ier = cufinufft_setpts_impl<T>(M, d_x.data().get(), d_y.data().get(), NULL, 0, NULL, NULL, NULL, dplan);
-    if (ier != 0) {
-        printf("err: cufinufft_setpts\n");
-        return ier;
-    }
-    cudaEventRecord(stop);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&milliseconds, start, stop);
-    totaltime += milliseconds;
-    printf("[time  ] cufinufft setNUpts:\t\t %.3g s\n", milliseconds / 1000);
-
-    cudaEventRecord(start);
-    ier = cufinufft_execute_impl<T>((cuda_complex<T> *)d_c.data().get(), (cuda_complex<T> *)d_fk.data().get(), dplan);
-    if (ier != 0) {
-        printf("err: cufinufft2d_exec\n");
-        return ier;
-    }
-    cudaEventRecord(stop);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&milliseconds, start, stop);
-    float exec_ms = milliseconds;
-    totaltime += milliseconds;
-    printf("[time  ] cufinufft exec:\t\t %.3g s\n", milliseconds / 1000);
-
-    cudaEventRecord(start);
-    ier = cufinufft_destroy_impl<T>(dplan);
-    cudaEventRecord(stop);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&milliseconds, start, stop);
-    totaltime += milliseconds;
-    printf("[time  ] cufinufft destroy:\t\t %.3g s\n", milliseconds / 1000);
-
-    if (type == 1)
-        fk = d_fk;
-    else if (type == 2)
-        c = d_c;
-
-    T rel_error = std::numeric_limits<T>::max();
-    if (type == 1) {
-        int i = ntransf - 1;                                // // choose some data to check
-        int nt1 = (int)(0.37 * N1), nt2 = (int)(0.26 * N2); // choose some mode index to check
-        thrust::complex<T> Ft = thrust::complex<T>(0, 0), J = thrust::complex<T>(0.0, iflag);
-        for (int j = 0; j < M; ++j)
-            Ft += c[j + i * M] * exp(J * (nt1 * x[j] + nt2 * y[j])); // crude direct
-        int it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2);                 // index in complex F as 1d array
-        rel_error = abs(Ft - fk[it + i * N]) / infnorm(N1, (std::complex<T> *)fk.data() + i * N);
-        printf("[gpu   ] %dth data one mode: rel err in F[%d,%d] is %.3g\n", i, nt1, nt2, rel_error);
-    } else if (type == 2) {
-        const int t = ntransf - 1;
-        thrust::complex<T> *fkstart = fk.data() + t * N1 * N2;
-        const thrust::complex<T> *cstart = c.data() + t * M;
-        const int jt = M / 2; // check arbitrary choice of one targ pt
-        const thrust::complex<T> J(0, iflag);
-        thrust::complex<T> ct(0, 0);
-        int m = 0;
-        for (int m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2) // loop in correct order over F
-            for (int m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1)
-                ct += fkstart[m++] * exp(J * (m1 * x[jt] + m2 * y[jt])); // crude direct
-
-        rel_error = abs(cstart[jt] - ct) / infnorm(M, (std::complex<T> *)c.data());
-        printf("[gpu   ] %dth data one targ: rel err in c[%d] is %.3g\n", t, jt, rel_error);
-    }
-
-    printf("[totaltime] %.3g us, speed %.3g NUpts/s\n", totaltime * 1000, M * ntransf / totaltime * 1000);
-    printf("\t\t\t\t\t(exec-only thoughput: %.3g NU pts/s)\n", M * ntransf / exec_ms * 1000);
-    return std::isnan(rel_error) || rel_error > checktol;
+  } else {
+    std::cerr << "Invalid type " << type << " supplied\n";
+    return 1;
+  }
+
+  d_x = x;
+  d_y = y;
+  if (type == 1)
+    d_c = c;
+  else if (type == 2)
+    d_fk = fk;
+
+  cudaEvent_t start, stop;
+  float milliseconds = 0;
+  double totaltime   = 0;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+
+  // warm up CUFFT (is slow, takes around 0.2 sec... )
+  cudaEventRecord(start);
+  {
+    int nf1 = 1;
+    cufftHandle fftplan;
+    cufftPlan1d(&fftplan, nf1, cufft_type<T>(), 1);
+  }
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  printf("[time  ] dummy warmup call to CUFFT\t %.3g s\n", milliseconds / 1000);
+
+  // now to the test...
+  cufinufft_plan_t<T> *dplan;
+  int dim = 2;
+
+  // Here we setup our own opts, for gpu_method.
+  cufinufft_opts opts;
+  cufinufft_default_opts(&opts);
+
+  opts.gpu_method       = method;
+  opts.gpu_maxbatchsize = maxbatchsize;
+
+  int nmodes[3] = {N1, N2, 1};
+  cudaEventRecord(start);
+  ier = cufinufft_makeplan_impl<T>(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts);
+  if (ier != 0) {
+    printf("err: cufinufft2d_plan\n");
+    return ier;
+  }
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  totaltime += milliseconds;
+  printf("[time  ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000);
+
+  cudaEventRecord(start);
+  ier = cufinufft_setpts_impl<T>(M, d_x.data().get(), d_y.data().get(), NULL, 0, NULL,
+                                 NULL, NULL, dplan);
+  if (ier != 0) {
+    printf("err: cufinufft_setpts\n");
+    return ier;
+  }
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  totaltime += milliseconds;
+  printf("[time  ] cufinufft setNUpts:\t\t %.3g s\n", milliseconds / 1000);
+
+  cudaEventRecord(start);
+  ier = cufinufft_execute_impl<T>((cuda_complex<T> *)d_c.data().get(),
+                                  (cuda_complex<T> *)d_fk.data().get(), dplan);
+  if (ier != 0) {
+    printf("err: cufinufft2d_exec\n");
+    return ier;
+  }
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  float exec_ms = milliseconds;
+  totaltime += milliseconds;
+  printf("[time  ] cufinufft exec:\t\t %.3g s\n", milliseconds / 1000);
+
+  cudaEventRecord(start);
+  ier = cufinufft_destroy_impl<T>(dplan);
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  totaltime += milliseconds;
+  printf("[time  ] cufinufft destroy:\t\t %.3g s\n", milliseconds / 1000);
+
+  if (type == 1)
+    fk = d_fk;
+  else if (type == 2)
+    c = d_c;
+
+  T rel_error = std::numeric_limits<T>::max();
+  if (type == 1) {
+    int i   = ntransf - 1;                              // check the last transform
+    int nt1 = (int)(0.37 * N1), nt2 = (int)(0.26 * N2); // choose some mode index to check
+    thrust::complex<T> Ft = thrust::complex<T>(0, 0), J = thrust::complex<T>(0.0, iflag);
+    for (int j = 0; j < M; ++j)
+      Ft += c[j + i * M] * exp(J * (nt1 * x[j] + nt2 * y[j])); // crude direct
+    int it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2); // index in complex F as 1d array
+    rel_error =
+        abs(Ft - fk[it + i * N]) / infnorm(N1, (std::complex<T> *)fk.data() + i * N);
+    printf("[gpu   ] %dth data one mode: rel err in F[%d,%d] is %.3g\n", i, nt1, nt2,
+           rel_error);
+  } else if (type == 2) {
+    const int t                      = ntransf - 1;
+    thrust::complex<T> *fkstart      = fk.data() + t * N1 * N2;
+    const thrust::complex<T> *cstart = c.data() + t * M;
+    const int jt                     = M / 2; // check arbitrary choice of one targ pt
+    const thrust::complex<T> J(0, iflag);
+    thrust::complex<T> ct(0, 0);
+    int m = 0;
+    for (int m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2) // loop in correct order over F
+      for (int m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1)
+        ct += fkstart[m++] * exp(J * (m1 * x[jt] + m2 * y[jt])); // crude direct
+
+    rel_error = abs(cstart[jt] - ct) / infnorm(M, (std::complex<T> *)c.data());
+    printf("[gpu   ] %dth data one targ: rel err in c[%d] is %.3g\n", t, jt, rel_error);
+  }
+
+  printf("[totaltime] %.3g us, speed %.3g NUpts/s\n", totaltime * 1000,
+         M * ntransf / totaltime * 1000);
+  printf("\t\t\t\t\t(exec-only thoughput: %.3g NU pts/s)\n",
+         M * ntransf / exec_ms * 1000);
+  return std::isnan(rel_error) || rel_error > checktol;
 }
 
 int main(int argc, char *argv[]) {
-    if (argc != 11) {
-        fprintf(stderr, "Usage: cufinufft2d1many_test method type N1 N2 ntransf maxbatchsize M tol checktol prec\n"
-                        "Arguments:\n"
-                        "  method: One of\n"
-                        "    1: nupts driven,\n"
-                        "    2: sub-problem, or\n"
-                        "  type: Type of transform (1, 2)\n"
-                        "  N1, N2: The size of the 2D array\n"
-                        "  ntransf: Number of inputs\n"
-                        "  maxbatchsize: Number of simultaneous transforms (or 0 for default)\n"
-                        "  M: The number of non-uniform points\n"
-                        "  tol: NUFFT tolerance\n"
-                        "  checktol: relative error to pass test\n"
-                        "  prec:  'f' or 'd' (float/double)\n");
-        return 1;
-    }
-    const int method = atoi(argv[1]);
-    const int type = atoi(argv[2]);
-    const int N1 = atof(argv[3]);
-    const int N2 = atof(argv[4]);
-    const int ntransf = atof(argv[5]);
-    const int maxbatchsize = atoi(argv[6]);
-    const int M = atoi(argv[7]);
-    const double tol = atof(argv[8]);
-    const double checktol = atof(argv[9]);
-    const char prec = argv[10][0];
-    const int iflag = 1;
-
-    if (prec == 'f')
-        return run_test<float>(method, type, N1, N2, ntransf, maxbatchsize, M, tol, checktol, iflag);
-    else if (prec == 'd')
-        return run_test<double>(method, type, N1, N2, ntransf, maxbatchsize, M, tol, checktol, iflag);
-    else
-        return -1;
+  if (argc != 11) {
+    fprintf(stderr,
+            "Usage: cufinufft2d1many_test method type N1 N2 ntransf maxbatchsize M tol "
+            "checktol prec\n"
+            "Arguments:\n"
+            "  method: One of\n"
+            "    1: nupts driven,\n"
+            "    2: sub-problem, or\n"
+            "  type: Type of transform (1, 2)\n"
+            "  N1, N2: The size of the 2D array\n"
+            "  ntransf: Number of inputs\n"
+            "  maxbatchsize: Number of simultaneous transforms (or 0 for default)\n"
+            "  M: The number of non-uniform points\n"
+            "  tol: NUFFT tolerance\n"
+            "  checktol: relative error to pass test\n"
+            "  prec:  'f' or 'd' (float/double)\n");
+    return 1;
+  }
+  const int method       = atoi(argv[1]);
+  const int type         = atoi(argv[2]);
+  const int N1           = atof(argv[3]); // atof so that e.g. 1e3 is accepted
+  const int N2           = atof(argv[4]);
+  const int ntransf      = atof(argv[5]);
+  const int maxbatchsize = atoi(argv[6]);
+  const int M            = atof(argv[7]); // atoi would stop at the 'e' in 2e4
+  const double tol       = atof(argv[8]);
+  const double checktol  = atof(argv[9]);
+  const char prec        = argv[10][0];
+  const int iflag        = 1;
+
+  if (prec == 'f')
+    return run_test<float>(method, type, N1, N2, ntransf, maxbatchsize, M, tol, checktol,
+                           iflag);
+  else if (prec == 'd')
+    return run_test<double>(method, type, N1, N2, ntransf, maxbatchsize, M, tol, checktol,
+                            iflag);
+  else
+    return -1;
 }
diff --git a/test/cuda/cufinufft3d_test.cu b/test/cuda/cufinufft3d_test.cu
index a882f6715..ddca0fd61 100644
--- a/test/cuda/cufinufft3d_test.cu
+++ b/test/cuda/cufinufft3d_test.cu
@@ -17,198 +17,210 @@
 
 using cufinufft::utils::infnorm;
 
-template <typename T>
-int run_test(int method, int type, int N1, int N2, int N3, int M, T tol, T checktol, int iflag) {
-    std::cout << std::scientific << std::setprecision(3);
-    int ier;
-
-    thrust::host_vector<T> x(M), y(M), z(M);
-    thrust::host_vector<thrust::complex<T>> c(M), fk(N1 * N2 * N3);
-
-    thrust::device_vector<T> d_x(M), d_y(M), d_z(M);
-    thrust::device_vector<thrust::complex<T>> d_c(M), d_fk(N1 * N2 * N3);
-
-    std::default_random_engine eng(1);
-    std::uniform_real_distribution<T> dist11(-1, 1);
-    auto randm11 = [&eng, &dist11]() { return dist11(eng); };
-
-    // Making data
+template<typename T>
+int run_test(int method, int type, int N1, int N2, int N3, int M, T tol, T checktol,
+             int iflag) {
+  std::cout << std::scientific << std::setprecision(3);
+  int ier;
+
+  thrust::host_vector<T> x(M), y(M), z(M);
+  thrust::host_vector<thrust::complex<T>> c(M), fk(N1 * N2 * N3);
+
+  thrust::device_vector<T> d_x(M), d_y(M), d_z(M);
+  thrust::device_vector<thrust::complex<T>> d_c(M), d_fk(N1 * N2 * N3);
+
+  std::default_random_engine eng(1);
+  std::uniform_real_distribution<T> dist11(-1, 1);
+  auto randm11 = [&eng, &dist11]() {
+    return dist11(eng);
+  };
+
+  // Making data
+  for (int i = 0; i < M; i++) {
+    x[i] = M_PI * randm11(); // x in [-pi,pi)
+    y[i] = M_PI * randm11();
+    z[i] = M_PI * randm11();
+  }
+  if (type == 1) {
     for (int i = 0; i < M; i++) {
-        x[i] = M_PI * randm11(); // x in [-pi,pi)
-        y[i] = M_PI * randm11();
-        z[i] = M_PI * randm11();
-    }
-    if (type == 1) {
-        for (int i = 0; i < M; i++) {
-            c[i].real(randm11());
-            c[i].imag(randm11());
-        }
-    } else if (type == 2) {
-        for (int i = 0; i < N1 * N2 * N3; i++) {
-            fk[i].real(randm11());
-            fk[i].imag(randm11());
-        }
-    } else {
-        std::cerr << "Invalid type " << type << " supplied\n";
-        return 1;
-    }
-
-    d_x = x;
-    d_y = y;
-    d_z = z;
-
-    if (type == 1)
-        d_c = c;
-    else if (type == 2)
-        d_fk = fk;
-
-    cudaEvent_t start, stop;
-    float milliseconds = 0;
-    float totaltime = 0;
-    cudaEventCreate(&start);
-    cudaEventCreate(&stop);
-
-    // warm up CUFFT (is slow, takes around 0.2 sec... )
-    cudaEventRecord(start);
-    {
-        int nf1 = 1;
-        cufftHandle fftplan;
-        cufftPlan1d(&fftplan, nf1, cufft_type<T>(), 1);
-    }
-    cudaEventRecord(stop);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&milliseconds, start, stop);
-    printf("[time  ] dummy warmup call to CUFFT\t %.3g s\n", milliseconds / 1000);
-
-    // now to the test...
-    cufinufft_plan_t<T> *dplan;
-    int dim = 3;
-
-    // Here we setup our own opts, for gpu_method and gpu_kerevalmeth.
-    cufinufft_opts opts;
-    cufinufft_default_opts(&opts);
-
-    opts.gpu_method = method;
-    opts.gpu_kerevalmeth = 1;
-    opts.gpu_maxbatchsize = 1;
-
-    int nmodes[3] = {N1, N2, N3};
-    int ntransf = 1;
-
-    cudaEventRecord(start);
-    ier = cufinufft_makeplan_impl(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts);
-    if (ier != 0) {
-        printf("err: cufinufft_makeplan\n");
-        return ier;
-    }
-    cudaEventRecord(stop);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&milliseconds, start, stop);
-    totaltime += milliseconds;
-    printf("[time  ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000);
-
-    cudaEventRecord(start);
-    ier = cufinufft_setpts_impl<T>(M, d_x.data().get(), d_y.data().get(), d_z.data().get(), 0, nullptr, nullptr,
-                                   nullptr, dplan);
-    if (ier != 0) {
-        printf("err: cufinufft_setpts\n");
-        return ier;
+      c[i].real(randm11());
+      c[i].imag(randm11());
     }
-    cudaEventRecord(stop);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&milliseconds, start, stop);
-    totaltime += milliseconds;
-    printf("[time  ] cufinufft setNUpts:\t\t %.3g s\n", milliseconds / 1000);
-
-    cudaEventRecord(start);
-    ier = cufinufft_execute_impl<T>((cuda_complex<T> *)d_c.data().get(), (cuda_complex<T> *)d_fk.data().get(), dplan);
-    if (ier != 0) {
-        printf("err: cufinufft_execute\n");
-        return ier;
+  } else if (type == 2) {
+    for (int i = 0; i < N1 * N2 * N3; i++) {
+      fk[i].real(randm11());
+      fk[i].imag(randm11());
     }
-    cudaEventRecord(stop);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&milliseconds, start, stop);
-    totaltime += milliseconds;
-    float exec_ms = milliseconds;
-    printf("[time  ] cufinufft exec:\t\t %.3g s\n", milliseconds / 1000);
-
-    cudaEventRecord(start);
-    ier = cufinufft_destroy_impl<T>(dplan);
-    cudaEventRecord(stop);
-    cudaEventSynchronize(stop);
-    cudaEventElapsedTime(&milliseconds, start, stop);
-    totaltime += milliseconds;
-    printf("[time  ] cufinufft destroy:\t\t %.3g s\n", milliseconds / 1000);
-
-    if (type == 1)
-        fk = d_fk;
-    else if (type == 2)
-        c = d_c;
-
-    printf("[Method %d] %d NU pts to %d U pts in %.3g s:\t%.3g NU pts/s\n", opts.gpu_method, M, N1 * N2 * N3,
-           totaltime / 1000, M / totaltime * 1000);
-    printf("\t\t\t\t\t(exec-only thoughput: %.3g NU pts/s)\n", M / exec_ms * 1000);
-
-    T rel_error = std::numeric_limits<T>::max();
-    if (type == 1) {
-        int nt1 = (int)(0.37 * N1), nt2 = (int)(0.26 * N2), nt3 = (int)(0.13 * N3); // choose some mode index to check
-        thrust::complex<T> Ft = thrust::complex<T>(0, 0), J = thrust::complex<T>(0.0, iflag);
-        for (int j = 0; j < M; ++j)
-            Ft += c[j] * exp(J * (nt1 * x[j] + nt2 * y[j] + nt3 * z[j])); // crude direct
-
-        int it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2) + N1 * N2 * (N3 / 2 + nt3); // index in complex F as 1d array
-        rel_error = abs(Ft - fk[it]) / infnorm(N1, (std::complex<T> *)fk.data());
-        printf("[gpu   ] one mode: rel err in F[%d,%d,%d] is %.3g\n", nt1, nt2, nt3, rel_error);
-    } else if (type == 2) {
-        int jt = M / 2; // check arbitrary choice of one targ pt
-        thrust::complex<T> J = thrust::complex<T>(0, iflag);
-        thrust::complex<T> ct = thrust::complex<T>(0, 0);
-
-        int m = 0;
-        for (int m3 = -(N3 / 2); m3 <= (N3 - 1) / 2; ++m3)     // loop in correct order over F
-            for (int m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2) // loop in correct order over F
-                for (int m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1)
-                    ct += fk[m++] * exp(J * (m1 * x[jt] + m2 * y[jt] + m3 * z[jt])); // crude direct
-
-        rel_error = abs(c[jt] - ct) / infnorm(M, (std::complex<T> *)c.data());
-        printf("[gpu   ] one targ: rel err in c[%ld] is %.3g\n", (int64_t)jt, rel_error);
-    }
-
-    return std::isnan(rel_error) || rel_error > checktol;
+  } else {
+    std::cerr << "Invalid type " << type << " supplied\n";
+    return 1;
+  }
+
+  d_x = x;
+  d_y = y;
+  d_z = z;
+
+  if (type == 1)
+    d_c = c;
+  else if (type == 2)
+    d_fk = fk;
+
+  cudaEvent_t start, stop;
+  float milliseconds = 0;
+  float totaltime    = 0;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+
+  // warm up CUFFT (is slow, takes around 0.2 sec... )
+  cudaEventRecord(start);
+  {
+    int nf1 = 1;
+    cufftHandle fftplan;
+    cufftPlan1d(&fftplan, nf1, cufft_type<T>(), 1);
+  }
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  printf("[time  ] dummy warmup call to CUFFT\t %.3g s\n", milliseconds / 1000);
+
+  // now to the test...
+  cufinufft_plan_t<T> *dplan;
+  int dim = 3;
+
+  // Here we setup our own opts, for gpu_method and gpu_kerevalmeth.
+  cufinufft_opts opts;
+  cufinufft_default_opts(&opts);
+
+  opts.gpu_method       = method;
+  opts.gpu_kerevalmeth  = 1;
+  opts.gpu_maxbatchsize = 1;
+
+  int nmodes[3] = {N1, N2, N3};
+  int ntransf   = 1;
+
+  cudaEventRecord(start);
+  ier = cufinufft_makeplan_impl(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts);
+  if (ier != 0) {
+    printf("err: cufinufft_makeplan\n");
+    return ier;
+  }
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  totaltime += milliseconds;
+  printf("[time  ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000);
+
+  cudaEventRecord(start);
+  ier = cufinufft_setpts_impl<T>(M, d_x.data().get(), d_y.data().get(), d_z.data().get(),
+                                 0, nullptr, nullptr, nullptr, dplan);
+  if (ier != 0) {
+    printf("err: cufinufft_setpts\n");
+    return ier;
+  }
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  totaltime += milliseconds;
+  printf("[time  ] cufinufft setNUpts:\t\t %.3g s\n", milliseconds / 1000);
+
+  cudaEventRecord(start);
+  ier = cufinufft_execute_impl<T>((cuda_complex<T> *)d_c.data().get(),
+                                  (cuda_complex<T> *)d_fk.data().get(), dplan);
+  if (ier != 0) {
+    printf("err: cufinufft_execute\n");
+    return ier;
+  }
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  totaltime += milliseconds;
+  float exec_ms = milliseconds;
+  printf("[time  ] cufinufft exec:\t\t %.3g s\n", milliseconds / 1000);
+
+  cudaEventRecord(start);
+  ier = cufinufft_destroy_impl<T>(dplan);
+  cudaEventRecord(stop);
+  cudaEventSynchronize(stop);
+  cudaEventElapsedTime(&milliseconds, start, stop);
+  totaltime += milliseconds;
+  printf("[time  ] cufinufft destroy:\t\t %.3g s\n", milliseconds / 1000);
+
+  if (type == 1)
+    fk = d_fk;
+  else if (type == 2)
+    c = d_c;
+
+  printf("[Method %d] %d NU pts to %d U pts in %.3g s:\t%.3g NU pts/s\n", opts.gpu_method,
+         M, N1 * N2 * N3, totaltime / 1000, M / totaltime * 1000);
+  printf("\t\t\t\t\t(exec-only thoughput: %.3g NU pts/s)\n", M / exec_ms * 1000);
+
+  T rel_error = std::numeric_limits<T>::max();
+  if (type == 1) {
+    int nt1 = (int)(0.37 * N1), nt2 = (int)(0.26 * N2), nt3 = (int)(0.13 * N3);
+    // (nt1, nt2, nt3) above is the one output mode chosen for checking. The
+    // fractions 0.37, 0.26 and 0.13 are fixed, so the same mode is picked on
+    // every run with the same N1, N2, N3. Below, Ft accumulates a direct sum
+    // over all M nonuniform points for this mode and is compared with the
+    // value the GPU transform wrote to fk.
+    thrust::complex<T> Ft = thrust::complex<T>(0, 0), J = thrust::complex<T>(0.0, iflag);
+    for (int j = 0; j < M; ++j)
+      Ft += c[j] * exp(J * (nt1 * x[j] + nt2 * y[j] + nt3 * z[j])); // crude direct
+
+    int it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2) + N1 * N2 * (N3 / 2 + nt3);
+    // `it` is the index of mode (nt1, nt2, nt3) in fk viewed as a 1d array: the
+    // first dimension varies fastest and each axis is offset by half its size.
+    rel_error = abs(Ft - fk[it]) / infnorm(N1, (std::complex<T> *)fk.data());
+    printf("[gpu   ] one mode: rel err in F[%d,%d,%d] is %.3g\n", nt1, nt2, nt3,
+           rel_error);
+  } else if (type == 2) {
+    int jt                = M / 2; // check arbitrary choice of one targ pt
+    thrust::complex<T> J  = thrust::complex<T>(0, iflag);
+    thrust::complex<T> ct = thrust::complex<T>(0, 0);
+
+    int m = 0;
+    for (int m3 = -(N3 / 2); m3 <= (N3 - 1) / 2; ++m3)   // loop in correct order over F
+      for (int m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2) // loop in correct order over F
+        for (int m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1)
+          ct += fk[m++] * exp(J * (m1 * x[jt] + m2 * y[jt] + m3 * z[jt])); // crude direct
+
+    rel_error = abs(c[jt] - ct) / infnorm(M, (std::complex<T> *)c.data());
+    printf("[gpu   ] one targ: rel err in c[%ld] is %.3g\n", (int64_t)jt, rel_error);
+  }
+
+  return std::isnan(rel_error) || rel_error > checktol;
 }
 
 int main(int argc, char *argv[]) {
-    if (argc < 10) {
-        fprintf(stderr, "Usage: cufinufft3d1_test method type N1 N2 N3 M tol checktol prec\n"
-                        "Arguments:\n"
-                        "  method: One of\n"
-                        "    1: nupts driven,\n"
-                        "    2: sub-problem, or\n"
-                        "    4: block gather.\n"
-                        "  type: Type of transform (1, 2)"
-                        "  N1, N2, N3: The size of the 3D array\n"
-                        "  M: The number of non-uniform points\n"
-                        "  tol: NUFFT tolerance\n"
-                        "  checktol:  relative error to pass test\n"
-                        "  prec:  'f' or 'd' (float/double)\n");
-        return 1;
-    }
-    const int method = atoi(argv[1]);
-    const int type = atoi(argv[2]);
-    const int N1 = atof(argv[3]);
-    const int N2 = atof(argv[4]);
-    const int N3 = atof(argv[5]);
-    const int M = atof(argv[6]);
-    const double tol = atof(argv[7]);
-    const double checktol = atof(argv[8]);
-    const char prec = argv[9][0];
-    const int iflag = 1;
-
-    if (prec == 'f')
-        return run_test<float>(method, type, N1, N2, N3, M, tol, checktol, iflag);
-    else if (prec == 'd')
-        return run_test<double>(method, type, N1, N2, N3, M, tol, checktol, iflag);
-    else
-        return -1;
+  if (argc < 10) { // need at least the 9 arguments named in the usage text
+    fprintf(stderr, "Usage: cufinufft3d1_test method type N1 N2 N3 M tol checktol prec\n"
+                    "Arguments:\n"
+                    "  method: One of\n"
+                    "    1: nupts driven,\n"
+                    "    2: sub-problem, or\n"
+                    "    4: block gather.\n"
+                    "  type: Type of transform (1, 2)\n"
+                    "  N1, N2, N3: The size of the 3D array\n"
+                    "  M: The number of non-uniform points\n"
+                    "  tol: NUFFT tolerance\n"
+                    "  checktol:  relative error to pass test\n"
+                    "  prec:  'f' or 'd' (float/double)\n");
+    return 1;
+  }
+  const int method      = atoi(argv[1]);
+  const int type        = atoi(argv[2]);
+  const int N1          = atof(argv[3]);
+  const int N2          = atof(argv[4]);
+  const int N3          = atof(argv[5]);
+  const int M           = atof(argv[6]);
+  const double tol      = atof(argv[7]);
+  const double checktol = atof(argv[8]);
+  const char prec       = argv[9][0];
+  const int iflag       = 1;
+
+  if (prec == 'f')
+    return run_test<float>(method, type, N1, N2, N3, M, tol, checktol, iflag);
+  else if (prec == 'd')
+    return run_test<double>(method, type, N1, N2, N3, M, tol, checktol, iflag);
+  else
+    return -1;
 }
diff --git a/test/cuda/fseries_kernel_test.cu b/test/cuda/fseries_kernel_test.cu
index 7e1a5f728..7f18ee21c 100644
--- a/test/cuda/fseries_kernel_test.cu
+++ b/test/cuda/fseries_kernel_test.cu
@@ -13,155 +13,146 @@ using namespace cufinufft::common;
 using namespace cufinufft::spreadinterp;
 using namespace cufinufft::utils;
 
-template <typename T>
-int run_test(int nf1, int dim, T eps, int gpu, int nf2, int nf3) {
+template<typename T> int run_test(int nf1, int dim, T eps, int gpu, int nf2, int nf3) {
 
-    finufft_spread_opts opts;
-    T *fwkerhalf1, *fwkerhalf2, *fwkerhalf3;
-    T *d_fwkerhalf1, *d_fwkerhalf2, *d_fwkerhalf3;
-    checkCudaErrors(cudaMalloc(&d_fwkerhalf1, sizeof(T) * (nf1 / 2 + 1)));
-    if (dim > 1)
-        checkCudaErrors(cudaMalloc(&d_fwkerhalf2, sizeof(T) * (nf2 / 2 + 1)));
-    if (dim > 2)
-        checkCudaErrors(cudaMalloc(&d_fwkerhalf3, sizeof(T) * (nf3 / 2 + 1)));
+  finufft_spread_opts opts;
+  T *fwkerhalf1, *fwkerhalf2, *fwkerhalf3;
+  T *d_fwkerhalf1, *d_fwkerhalf2, *d_fwkerhalf3;
+  checkCudaErrors(cudaMalloc(&d_fwkerhalf1, sizeof(T) * (nf1 / 2 + 1)));
+  if (dim > 1) checkCudaErrors(cudaMalloc(&d_fwkerhalf2, sizeof(T) * (nf2 / 2 + 1)));
+  if (dim > 2) checkCudaErrors(cudaMalloc(&d_fwkerhalf3, sizeof(T) * (nf3 / 2 + 1)));
 
-    int ier = setup_spreader(opts, (T)eps, (T)2.0, 0);
+  int ier = setup_spreader(opts, (T)eps, (T)2.0, 0);
 
-    cudaEvent_t start, stop;
-    cudaEventCreate(&start);
-    cudaEventCreate(&stop);
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
 
-    float milliseconds = 0;
-    float gputime = 0;
-    float cputime = 0;
+  float milliseconds = 0;
+  float gputime      = 0;
+  float cputime      = 0;
 
-    CNTime timer;
-    if (!gpu) {
-        timer.start();
-        fwkerhalf1 = (T *)malloc(sizeof(T) * (nf1 / 2 + 1));
-        if (dim > 1)
-            fwkerhalf2 = (T *)malloc(sizeof(T) * (nf2 / 2 + 1));
-        if (dim > 2)
-            fwkerhalf3 = (T *)malloc(sizeof(T) * (nf3 / 2 + 1));
+  CNTime timer;
+  if (!gpu) {
+    timer.start();
+    fwkerhalf1 = (T *)malloc(sizeof(T) * (nf1 / 2 + 1));
+    if (dim > 1) fwkerhalf2 = (T *)malloc(sizeof(T) * (nf2 / 2 + 1));
+    if (dim > 2) fwkerhalf3 = (T *)malloc(sizeof(T) * (nf3 / 2 + 1));
 
-        onedim_fseries_kernel(nf1, fwkerhalf1, opts);
-        if (dim > 1)
-            onedim_fseries_kernel(nf2, fwkerhalf2, opts);
-        if (dim > 2)
-            onedim_fseries_kernel(nf3, fwkerhalf3, opts);
-        cputime = timer.elapsedsec();
-        cudaEventRecord(start);
-        {
-            checkCudaErrors(cudaMemcpy(d_fwkerhalf1, fwkerhalf1, sizeof(T) * (nf1 / 2 + 1), cudaMemcpyHostToDevice));
-            if (dim > 1)
-                checkCudaErrors(
-                    cudaMemcpy(d_fwkerhalf2, fwkerhalf2, sizeof(T) * (nf2 / 2 + 1), cudaMemcpyHostToDevice));
-            if (dim > 2)
-                checkCudaErrors(
-                    cudaMemcpy(d_fwkerhalf3, fwkerhalf3, sizeof(T) * (nf3 / 2 + 1), cudaMemcpyHostToDevice));
-        }
-        cudaEventRecord(stop);
-        cudaEventSynchronize(stop);
-        cudaEventElapsedTime(&milliseconds, start, stop);
-        gputime = milliseconds;
-        printf("[time  ] dim=%d, nf1=%8d, ns=%2d, CPU: %6.2f ms\n", dim, nf1, opts.nspread, gputime + cputime * 1000);
-        free(fwkerhalf1);
-        if (dim > 1)
-            free(fwkerhalf2);
-        if (dim > 2)
-            free(fwkerhalf3);
-    } else {
-        timer.start();
-        std::complex<double> a[dim * MAX_NQUAD];
-        T f[dim * MAX_NQUAD];
-        onedim_fseries_kernel_precomp(nf1, f, a, opts);
-        if (dim > 1)
-            onedim_fseries_kernel_precomp(nf2, f + MAX_NQUAD, a + MAX_NQUAD, opts);
-        if (dim > 2)
-            onedim_fseries_kernel_precomp(nf3, f + 2 * MAX_NQUAD, a + 2 * MAX_NQUAD, opts);
-        cputime = timer.elapsedsec();
+    onedim_fseries_kernel(nf1, fwkerhalf1, opts);
+    if (dim > 1) onedim_fseries_kernel(nf2, fwkerhalf2, opts);
+    if (dim > 2) onedim_fseries_kernel(nf3, fwkerhalf3, opts);
+    cputime = timer.elapsedsec();
+    cudaEventRecord(start);
+    {
+      checkCudaErrors(cudaMemcpy(d_fwkerhalf1, fwkerhalf1, sizeof(T) * (nf1 / 2 + 1),
+                                 cudaMemcpyHostToDevice));
+      if (dim > 1)
+        checkCudaErrors(cudaMemcpy(d_fwkerhalf2, fwkerhalf2, sizeof(T) * (nf2 / 2 + 1),
+                                   cudaMemcpyHostToDevice));
+      if (dim > 2)
+        checkCudaErrors(cudaMemcpy(d_fwkerhalf3, fwkerhalf3, sizeof(T) * (nf3 / 2 + 1),
+                                   cudaMemcpyHostToDevice));
+    }
+    cudaEventRecord(stop);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&milliseconds, start, stop);
+    gputime = milliseconds;
+    printf("[time  ] dim=%d, nf1=%8d, ns=%2d, CPU: %6.2f ms\n", dim, nf1, opts.nspread,
+           gputime + cputime * 1000);
+    free(fwkerhalf1);
+    if (dim > 1) free(fwkerhalf2);
+    if (dim > 2) free(fwkerhalf3);
+  } else {
+    timer.start();
+    std::complex<double> a[dim * MAX_NQUAD];
+    T f[dim * MAX_NQUAD];
+    onedim_fseries_kernel_precomp(nf1, f, a, opts);
+    if (dim > 1) onedim_fseries_kernel_precomp(nf2, f + MAX_NQUAD, a + MAX_NQUAD, opts);
+    if (dim > 2)
+      onedim_fseries_kernel_precomp(nf3, f + 2 * MAX_NQUAD, a + 2 * MAX_NQUAD, opts);
+    cputime = timer.elapsedsec();
 
-        cuDoubleComplex *d_a;
-        T *d_f;
-        cudaEventRecord(start);
-        {
-            checkCudaErrors(cudaMalloc(&d_a, dim * MAX_NQUAD * sizeof(cuDoubleComplex)));
-            checkCudaErrors(cudaMalloc(&d_f, dim * MAX_NQUAD * sizeof(T)));
-            checkCudaErrors(cudaMemcpy(d_a, a, dim * MAX_NQUAD * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice));
-            checkCudaErrors(cudaMemcpy(d_f, f, dim * MAX_NQUAD * sizeof(T), cudaMemcpyHostToDevice));
-            ier = cufserieskernelcompute(dim, nf1, nf2, nf3, d_f, d_a, d_fwkerhalf1, d_fwkerhalf2, d_fwkerhalf3,
-                                         opts.nspread, cudaStreamDefault);
-        }
-        cudaEventRecord(stop);
-        cudaEventSynchronize(stop);
-        cudaEventElapsedTime(&milliseconds, start, stop);
-        gputime = milliseconds;
-        printf("[time  ] dim=%d, nf1=%8d, ns=%2d, GPU: %6.2f ms\n", dim, nf1, opts.nspread, gputime + cputime * 1000);
-        cudaFree(d_a);
-        cudaFree(d_f);
+    cuDoubleComplex *d_a;
+    T *d_f;
+    cudaEventRecord(start);
+    {
+      checkCudaErrors(cudaMalloc(&d_a, dim * MAX_NQUAD * sizeof(cuDoubleComplex)));
+      checkCudaErrors(cudaMalloc(&d_f, dim * MAX_NQUAD * sizeof(T)));
+      checkCudaErrors(cudaMemcpy(d_a, a, dim * MAX_NQUAD * sizeof(cuDoubleComplex),
+                                 cudaMemcpyHostToDevice));
+      checkCudaErrors(
+          cudaMemcpy(d_f, f, dim * MAX_NQUAD * sizeof(T), cudaMemcpyHostToDevice));
+      ier =
+          cufserieskernelcompute(dim, nf1, nf2, nf3, d_f, d_a, d_fwkerhalf1, d_fwkerhalf2,
+                                 d_fwkerhalf3, opts.nspread, cudaStreamDefault);
     }
+    cudaEventRecord(stop);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&milliseconds, start, stop);
+    gputime = milliseconds;
+    printf("[time  ] dim=%d, nf1=%8d, ns=%2d, GPU: %6.2f ms\n", dim, nf1, opts.nspread,
+           gputime + cputime * 1000);
+    cudaFree(d_a);
+    cudaFree(d_f);
+  }
 
-    fwkerhalf1 = (T *)malloc(sizeof(T) * (nf1 / 2 + 1));
-    if (dim > 1)
-        fwkerhalf2 = (T *)malloc(sizeof(T) * (nf2 / 2 + 1));
-    if (dim > 2)
-        fwkerhalf3 = (T *)malloc(sizeof(T) * (nf3 / 2 + 1));
+  fwkerhalf1 = (T *)malloc(sizeof(T) * (nf1 / 2 + 1));
+  if (dim > 1) fwkerhalf2 = (T *)malloc(sizeof(T) * (nf2 / 2 + 1));
+  if (dim > 2) fwkerhalf3 = (T *)malloc(sizeof(T) * (nf3 / 2 + 1));
 
-    checkCudaErrors(cudaMemcpy(fwkerhalf1, d_fwkerhalf1, sizeof(T) * (nf1 / 2 + 1), cudaMemcpyDeviceToHost));
-    if (dim > 1)
-        checkCudaErrors(cudaMemcpy(fwkerhalf2, d_fwkerhalf2, sizeof(T) * (nf2 / 2 + 1), cudaMemcpyDeviceToHost));
-    if (dim > 2)
-        checkCudaErrors(cudaMemcpy(fwkerhalf3, d_fwkerhalf3, sizeof(T) * (nf3 / 2 + 1), cudaMemcpyDeviceToHost));
-    for (int i = 0; i < nf1 / 2 + 1; i++)
-        printf("%10.8e ", fwkerhalf1[i]);
-    printf("\n");
-    if (dim > 1)
-        for (int i = 0; i < nf2 / 2 + 1; i++)
-            printf("%10.8e ", fwkerhalf2[i]);
-    printf("\n");
-    if (dim > 2)
-        for (int i = 0; i < nf3 / 2 + 1; i++)
-            printf("%10.8e ", fwkerhalf3[i]);
-    printf("\n");
+  checkCudaErrors(cudaMemcpy(fwkerhalf1, d_fwkerhalf1, sizeof(T) * (nf1 / 2 + 1),
+                             cudaMemcpyDeviceToHost));
+  if (dim > 1)
+    checkCudaErrors(cudaMemcpy(fwkerhalf2, d_fwkerhalf2, sizeof(T) * (nf2 / 2 + 1),
+                               cudaMemcpyDeviceToHost));
+  if (dim > 2)
+    checkCudaErrors(cudaMemcpy(fwkerhalf3, d_fwkerhalf3, sizeof(T) * (nf3 / 2 + 1),
+                               cudaMemcpyDeviceToHost));
+  for (int i = 0; i < nf1 / 2 + 1; i++) printf("%10.8e ", fwkerhalf1[i]);
+  printf("\n");
+  if (dim > 1)
+    for (int i = 0; i < nf2 / 2 + 1; i++) printf("%10.8e ", fwkerhalf2[i]);
+  printf("\n");
+  if (dim > 2)
+    for (int i = 0; i < nf3 / 2 + 1; i++) printf("%10.8e ", fwkerhalf3[i]);
+  printf("\n");
 
-    return 0;
+  return 0;
 }
 
 int main(int argc, char *argv[]) {
-    if (argc < 3) {
-        fprintf(stderr, "Usage: onedim_fseries_kernel_test prec nf1 [dim [tol [gpuversion [nf2 [nf3]]]]]\n"
-                        "Arguments:\n"
-                        "  prec: 'f' or 'd' (float/double)\n"
-                        "  nf1: The size of the upsampled fine grid size in x.\n"
-                        "  dim: Dimension of the nuFFT.\n"
-                        "  tol: NUFFT tolerance (default 1e-6).\n"
-                        "  gpuversion: Use gpu version or not (default True).\n"
-                        "  nf2: The size of the upsampled fine grid size in y. (default nf1)\n"
-                        "  nf3: The size of the upsampled fine grid size in z. (default nf3)\n");
-        return 1;
-    }
-    char prec = argv[1][0];
-    int nf1 = std::atof(argv[2]);
-    int dim = 1;
-    double eps = 1e-6;
-    int gpu = 1;
-    int nf2 = nf1;
-    int nf3 = nf1;
-    if (argc > 3)
-        dim = std::atoi(argv[3]);
-    if (argc > 4)
-        eps = std::atof(argv[4]);
-    if (argc > 5)
-        gpu = std::atoi(argv[5]);
-    if (argc > 6)
-        nf2 = std::atoi(argv[6]);
-    if (argc > 7)
-        nf3 = std::atoi(argv[7]);
+  if (argc < 3) {
+    fprintf(stderr,
+            "Usage: onedim_fseries_kernel_test prec nf1 [dim [tol [gpuversion [nf2 "
+            "[nf3]]]]]\n"
+            "Arguments:\n"
+            "  prec: 'f' or 'd' (float/double)\n"
+            "  nf1: The size of the upsampled fine grid size in x.\n"
+            "  dim: Dimension of the nuFFT.\n"
+            "  tol: NUFFT tolerance (default 1e-6).\n"
+            "  gpuversion: Use gpu version or not (default True).\n"
+            "  nf2: The size of the upsampled fine grid size in y. (default nf1)\n"
+            "  nf3: The size of the upsampled fine grid size in z. (default nf1)\n");
+    return 1;
+  }
+  char prec  = argv[1][0];
+  int nf1    = std::atof(argv[2]);
+  int dim    = 1;
+  double eps = 1e-6;
+  int gpu    = 1;
+  int nf2    = nf1;
+  int nf3    = nf1;
+  if (argc > 3) dim = std::atoi(argv[3]);
+  if (argc > 4) eps = std::atof(argv[4]);
+  if (argc > 5) gpu = std::atoi(argv[5]);
+  if (argc > 6) nf2 = std::atoi(argv[6]);
+  if (argc > 7) nf3 = std::atoi(argv[7]);
 
-    if (prec == 'f')
-        return run_test<float>(nf1, dim, eps, gpu, nf2, nf3);
-    else if (prec == 'd')
-        return run_test<double>(nf1, dim, eps, gpu, nf2, nf3);
-    else
-        return -1;
+  if (prec == 'f')
+    return run_test<float>(nf1, dim, eps, gpu, nf2, nf3);
+  else if (prec == 'd')
+    return run_test<double>(nf1, dim, eps, gpu, nf2, nf3);
+  else
+    return -1;
 }
diff --git a/test/directft/dirft1d.cpp b/test/directft/dirft1d.cpp
index a52d826c4..5f36d76d7 100644
--- a/test/directft/dirft1d.cpp
+++ b/test/directft/dirft1d.cpp
@@ -1,14 +1,14 @@
-#include <finufft/dirft.h>
 #include <finufft/defs.h>
+#include <finufft/dirft.h>
 #include <iostream>
 
 // This is basically a port of dirft1d.f from CMCL package, except with
 // the 1/nj prefactors for type-1 removed.
 
-void dirft1d1(BIGINT nj,FLT* x,CPX* c,int iflag,BIGINT ms, CPX* f)
+void dirft1d1(BIGINT nj, FLT *x, CPX *c, int iflag, BIGINT ms, CPX *f)
 /* Direct computation of 1D type-1 nonuniform FFT. Interface same as finufft1d1.
 c                  nj-1
-c     f[k1]    =   SUM  c[j] exp(+-i k1 x[j]) 
+c     f[k1]    =   SUM  c[j] exp(+-i k1 x[j])
 c                  j=0
 c
 c     for -ms/2 <= k1 <= (ms-1)/2.
@@ -17,24 +17,24 @@ c     used, otherwise the - sign is used, in the exponential.
 *  Uses C++ complex type and winding trick.  Barnett 1/25/17
 */
 {
-  BIGINT kmin = -(ms/2);                   // integer divide
-  for (BIGINT m=0;m<ms;++m) f[m] = CPX(0,0);   // it knows f is complex type
-  for (BIGINT j=0;j<nj;++j) {
-    CPX a = (iflag>0) ? exp(IMA*x[j]) : exp(-IMA*x[j]);
-    CPX p = pow(a,(FLT)kmin);   // starting phase for most neg freq
+  BIGINT kmin = -(ms / 2);                          // integer divide
+  for (BIGINT m = 0; m < ms; ++m) f[m] = CPX(0, 0); // it knows f is complex type
+  for (BIGINT j = 0; j < nj; ++j) {
+    CPX a  = (iflag > 0) ? exp(IMA * x[j]) : exp(-IMA * x[j]);
+    CPX p  = pow(a, (FLT)kmin); // starting phase for most neg freq
     CPX cc = c[j];              // no 1/nj prefac
-    for (BIGINT m=0;m<ms;++m) {
+    for (BIGINT m = 0; m < ms; ++m) {
       f[m] += cc * p;
       p *= a;
     }
   }
 }
 
-void dirft1d2(BIGINT nj,FLT* x,CPX* c,int iflag,BIGINT ms, CPX* f)
+void dirft1d2(BIGINT nj, FLT *x, CPX *c, int iflag, BIGINT ms, CPX *f)
 /* Direct computation of 1D type-2 nonuniform FFT. Interface same as finufft1d2
 c
-c     c[j] = SUM   f[k1] exp(+-i k1 x[j]) 
-c             k1  
+c     c[j] = SUM   f[k1] exp(+-i k1 x[j])
+c             k1
 c                            for j = 0,...,nj-1
 c     where sum is over -ms/2 <= k1 <= (ms-1)/2.
 c     The input array is in increasing k1 ordering. If iflag>0 the + sign is
@@ -42,12 +42,12 @@ c     used, otherwise the - sign is used, in the exponential.
 *  Uses C++ complex type and winding trick.  Barnett 1/25/17
 */
 {
-  BIGINT kmin = -(ms/2);                     // integer divide
-  for (BIGINT j=0;j<nj;++j) {
-    CPX a = (iflag>0) ? exp(IMA*x[j]) : exp(-IMA*x[j]);
-    CPX p = pow(a,(FLT)kmin);   // starting phase for most neg freq
-    CPX cc = CPX(0,0);
-    for (BIGINT m=0;m<ms;++m) {
+  BIGINT kmin = -(ms / 2); // integer divide
+  for (BIGINT j = 0; j < nj; ++j) {
+    CPX a  = (iflag > 0) ? exp(IMA * x[j]) : exp(-IMA * x[j]);
+    CPX p  = pow(a, (FLT)kmin); // starting phase for most neg freq
+    CPX cc = CPX(0, 0);
+    for (BIGINT m = 0; m < ms; ++m) {
       cc += f[m] * p;
       p *= a;
     }
@@ -55,20 +55,19 @@ c     used, otherwise the - sign is used, in the exponential.
   }
 }
 
-void dirft1d3(BIGINT nj,FLT* x,CPX* c,int iflag,BIGINT nk, FLT* s, CPX* f)
+void dirft1d3(BIGINT nj, FLT *x, CPX *c, int iflag, BIGINT nk, FLT *s, CPX *f)
 /* Direct computation of 1D type-3 nonuniform FFT. Interface same as finufft1d3
 c              nj-1
-c     f[k]  =  SUM   c[j] exp(+-i s[k] x[j]) 
-c              j=0                   
+c     f[k]  =  SUM   c[j] exp(+-i s[k] x[j])
+c              j=0
 c                    for k = 0, ..., nk-1
 c  If iflag>0 the + sign is used, otherwise the - sign is used, in the
 c  exponential. Uses C++ complex type. Simple brute force.  Barnett 1/25/17
 */
 {
-  for (BIGINT k=0;k<nk;++k) {
-    CPX ss = (iflag>0) ? IMA*s[k] : -IMA*s[k];
-    f[k] = CPX(0,0);
-    for (BIGINT j=0;j<nj;++j)
-      f[k] += c[j] * exp(ss*x[j]);
+  for (BIGINT k = 0; k < nk; ++k) {
+    CPX ss = (iflag > 0) ? IMA * s[k] : -IMA * s[k];
+    f[k]   = CPX(0, 0);
+    for (BIGINT j = 0; j < nj; ++j) f[k] += c[j] * exp(ss * x[j]);
   }
 }
diff --git a/test/directft/dirft2d.cpp b/test/directft/dirft2d.cpp
index 4f91141f6..c13661549 100644
--- a/test/directft/dirft2d.cpp
+++ b/test/directft/dirft2d.cpp
@@ -1,11 +1,11 @@
-#include <finufft/dirft.h>
 #include <finufft/defs.h>
+#include <finufft/dirft.h>
 #include <iostream>
 
 // This is basically a port of dirft2d.f from CMCL package, except with
 // the 1/nj prefactors for type-1 removed.
 
-void dirft2d1(BIGINT nj,FLT* x,FLT *y,CPX* c,int iflag,BIGINT ms, BIGINT mt, CPX* f)
+void dirft2d1(BIGINT nj, FLT *x, FLT *y, CPX *c, int iflag, BIGINT ms, BIGINT mt, CPX *f)
 /* Direct computation of 2D type-1 nonuniform FFT. Interface same as finufft2d1.
 c                  nj-1
 c     f[k1,k2] =   SUM  c[j] exp(+-i (k1 x[j] + k2 y[j]))
@@ -18,32 +18,32 @@ c     used, otherwise the - sign is used, in the exponential.
 *  Uses C++ complex type and winding trick.  Barnett 1/26/17
 */
 {
-  BIGINT k1min = -(ms/2), k2min = -(mt/2);                 // integer divide
-  BIGINT N = ms*mt;        // total # output modes
-  for (BIGINT m=0;m<N;++m) f[m] = CPX(0,0);    // it knows f is complex type
-  for (BIGINT j=0;j<nj;++j) {            // src pts
-    CPX a1 = (iflag>0) ? exp(IMA*x[j]) : exp(-IMA*x[j]);
-    CPX a2 = (iflag>0) ? exp(IMA*y[j]) : exp(-IMA*y[j]);
-    CPX sp1 = pow(a1,(FLT)k1min);  // starting phase for most neg k1 freq  
-    CPX p2 = pow(a2,(FLT)k2min);
-    CPX cc = c[j];                 // no 1/nj norm
-    BIGINT m=0;      // output pointer
-    for (BIGINT m2=0;m2<mt;++m2) {
-      CPX p1 = sp1;                // must reset p1 for each inner loop
-      for (BIGINT m1=0;m1<ms;++m1) {  // ms is fast, mt slow
-	f[m++] += cc * p1 * p2;
-	p1 *= a1;
+  BIGINT k1min = -(ms / 2), k2min = -(mt / 2);     // integer divide
+  BIGINT N = ms * mt;                              // total # output modes
+  for (BIGINT m = 0; m < N; ++m) f[m] = CPX(0, 0); // it knows f is complex type
+  for (BIGINT j = 0; j < nj; ++j) {                // src pts
+    CPX a1   = (iflag > 0) ? exp(IMA * x[j]) : exp(-IMA * x[j]);
+    CPX a2   = (iflag > 0) ? exp(IMA * y[j]) : exp(-IMA * y[j]);
+    CPX sp1  = pow(a1, (FLT)k1min);        // starting phase for most neg k1 freq
+    CPX p2   = pow(a2, (FLT)k2min);
+    CPX cc   = c[j];                       // no 1/nj norm
+    BIGINT m = 0;                          // output pointer
+    for (BIGINT m2 = 0; m2 < mt; ++m2) {
+      CPX p1 = sp1;                        // must reset p1 for each inner loop
+      for (BIGINT m1 = 0; m1 < ms; ++m1) { // ms is fast, mt slow
+        f[m++] += cc * p1 * p2;
+        p1 *= a1;
       }
       p2 *= a2;
     }
   }
 }
 
-void dirft2d2(BIGINT nj,FLT* x,FLT *y,CPX* c,int iflag,BIGINT ms, BIGINT mt, CPX* f)
+void dirft2d2(BIGINT nj, FLT *x, FLT *y, CPX *c, int iflag, BIGINT ms, BIGINT mt, CPX *f)
 /* Direct computation of 2D type-2 nonuniform FFT. Interface same as finufft2d2
 
-     c[j] = SUM   f[k1,k2] exp(+-i (k1 x[j] + k2 y[j])) 
-            k1,k2  
+     c[j] = SUM   f[k1,k2] exp(+-i (k1 x[j] + k2 y[j]))
+            k1,k2
                             for j = 0,...,nj-1
     where sum is over -ms/2 <= k1 <= (ms-1)/2,  -mt/2 <= k2 <= (mt-1)/2.
 
@@ -54,19 +54,19 @@ void dirft2d2(BIGINT nj,FLT* x,FLT *y,CPX* c,int iflag,BIGINT ms, BIGINT mt, CPX
     Uses C++ complex type and winding trick.  Barnett 1/26/17
 */
 {
-  BIGINT k1min = -(ms/2), k2min = -(mt/2);                 // integer divide
-  for (BIGINT j=0;j<nj;++j) {
-    CPX a1 = (iflag>0) ? exp(IMA*x[j]) : exp(-IMA*x[j]);
-    CPX a2 = (iflag>0) ? exp(IMA*y[j]) : exp(-IMA*y[j]);
-    CPX sp1 = pow(a1,(FLT)k1min);
-    CPX p2 = pow(a2,(FLT)k2min);
-    CPX cc = CPX(0,0);
-    BIGINT m=0;      // input pointer
-    for (BIGINT m2=0;m2<mt;++m2) {
+  BIGINT k1min = -(ms / 2), k2min = -(mt / 2); // integer divide
+  for (BIGINT j = 0; j < nj; ++j) {
+    CPX a1   = (iflag > 0) ? exp(IMA * x[j]) : exp(-IMA * x[j]);
+    CPX a2   = (iflag > 0) ? exp(IMA * y[j]) : exp(-IMA * y[j]);
+    CPX sp1  = pow(a1, (FLT)k1min);
+    CPX p2   = pow(a2, (FLT)k2min);
+    CPX cc   = CPX(0, 0);
+    BIGINT m = 0; // input pointer
+    for (BIGINT m2 = 0; m2 < mt; ++m2) {
       CPX p1 = sp1;
-      for (BIGINT m1=0;m1<ms;++m1) {
-	cc += f[m++] * p1 * p2;
-	p1 *= a1;
+      for (BIGINT m1 = 0; m1 < ms; ++m1) {
+        cc += f[m++] * p1 * p2;
+        p1 *= a1;
       }
       p2 *= a2;
     }
@@ -74,21 +74,21 @@ void dirft2d2(BIGINT nj,FLT* x,FLT *y,CPX* c,int iflag,BIGINT ms, BIGINT mt, CPX
   }
 }
 
-void dirft2d3(BIGINT nj,FLT* x,FLT *y,CPX* c,int iflag,BIGINT nk, FLT* s, FLT* t, CPX* f)
+void dirft2d3(BIGINT nj, FLT *x, FLT *y, CPX *c, int iflag, BIGINT nk, FLT *s, FLT *t,
+              CPX *f)
 /* Direct computation of 2D type-3 nonuniform FFT. Interface same as finufft2d3
 c               nj-1
 c     f[k]  =   SUM   c[j] exp(+-i (s[k] x[j] + t[k] y[j]))
-c               j=0                   
+c               j=0
 c                    for k = 0, ..., nk-1
 c  If iflag>0 the + sign is used, otherwise the - sign is used, in the
 c  exponential. Uses C++ complex type. Simple brute force.  Barnett 1/26/17
 */
 {
-  for (BIGINT k=0;k<nk;++k) {
-    CPX ss = (iflag>0) ? IMA*s[k] : -IMA*s[k];
-    CPX tt = (iflag>0) ? IMA*t[k] : -IMA*t[k];
-    f[k] = CPX(0,0);
-    for (BIGINT j=0;j<nj;++j)
-      f[k] += c[j] * exp(ss*x[j] + tt*y[j]);
+  for (BIGINT k = 0; k < nk; ++k) {
+    CPX ss = (iflag > 0) ? IMA * s[k] : -IMA * s[k];
+    CPX tt = (iflag > 0) ? IMA * t[k] : -IMA * t[k];
+    f[k]   = CPX(0, 0);
+    for (BIGINT j = 0; j < nj; ++j) f[k] += c[j] * exp(ss * x[j] + tt * y[j]);
   }
 }
diff --git a/test/directft/dirft3d.cpp b/test/directft/dirft3d.cpp
index 63e002283..452b62471 100644
--- a/test/directft/dirft3d.cpp
+++ b/test/directft/dirft3d.cpp
@@ -1,11 +1,12 @@
-#include <finufft/dirft.h>
 #include <finufft/defs.h>
+#include <finufft/dirft.h>
 #include <iostream>
 
 // This is basically a port of dirft2d.f from CMCL package, except with
 // the 1/nj prefactors for type-1 removed.
 
-void dirft3d1(BIGINT nj,FLT* x,FLT *y,FLT *z, CPX* c,int iflag,BIGINT ms, BIGINT mt, BIGINT mu, CPX* f)
+void dirft3d1(BIGINT nj, FLT *x, FLT *y, FLT *z, CPX *c, int iflag, BIGINT ms, BIGINT mt,
+              BIGINT mu, CPX *f)
 /* Direct computation of 3D type-1 nonuniform FFT. Interface same as finufft3d1.
 c                     nj-1
 c     f[k1,k2,k3] =   SUM  c[j] exp(+-i (k1 x[j] + k2 y[j] + k2 z[j]))
@@ -19,38 +20,39 @@ c     used, otherwise the - sign is used, in the exponential.
 *  Uses C++ complex type and winding trick.  Barnett 2/1/17
 */
 {
-  BIGINT k1min = -(ms/2), k2min = -(mt/2), k3min = -(mu/2);   // integer divide
-  BIGINT N = ms*mt*mu;        // total # output modes
-  for (BIGINT m=0;m<N;++m) f[m] = CPX(0,0);    // it knows f is complex type
-  for (BIGINT j=0;j<nj;++j) {            // src pts
-    CPX a1 = (iflag>0) ? exp(IMA*x[j]) : exp(-IMA*x[j]);
-    CPX a2 = (iflag>0) ? exp(IMA*y[j]) : exp(-IMA*y[j]);
-    CPX a3 = (iflag>0) ? exp(IMA*z[j]) : exp(-IMA*z[j]);
-    CPX sp1 = pow(a1,(FLT)k1min);  // starting phase for most neg k1 freq
-    CPX sp2 = pow(a2,(FLT)k2min);
-    CPX p3 = pow(a3,(FLT)k3min);
-    CPX cc = c[j];                 // no 1/nj norm
-    BIGINT m=0;      // output pointer
-    for (BIGINT m3=0;m3<mu;++m3) {
+  BIGINT k1min = -(ms / 2), k2min = -(mt / 2), k3min = -(mu / 2); // integer divide
+  BIGINT N = ms * mt * mu;                                        // total # output modes
+  for (BIGINT m = 0; m < N; ++m) f[m] = CPX(0, 0); // it knows f is complex type
+  for (BIGINT j = 0; j < nj; ++j) {                // src pts
+    CPX a1   = (iflag > 0) ? exp(IMA * x[j]) : exp(-IMA * x[j]);
+    CPX a2   = (iflag > 0) ? exp(IMA * y[j]) : exp(-IMA * y[j]);
+    CPX a3   = (iflag > 0) ? exp(IMA * z[j]) : exp(-IMA * z[j]);
+    CPX sp1  = pow(a1, (FLT)k1min); // starting phase for most neg k1 freq
+    CPX sp2  = pow(a2, (FLT)k2min);
+    CPX p3   = pow(a3, (FLT)k3min);
+    CPX cc   = c[j]; // no 1/nj norm
+    BIGINT m = 0;    // output pointer
+    for (BIGINT m3 = 0; m3 < mu; ++m3) {
       CPX p2 = sp2;
-      for (BIGINT m2=0;m2<mt;++m2) {
-	CPX p1 = sp1;
-	for (BIGINT m1=0;m1<ms;++m1) {
-	  f[m++] += cc * p1 * p2 * p3;
-	  p1 *= a1;
-	}
-	p2 *= a2;
+      for (BIGINT m2 = 0; m2 < mt; ++m2) {
+        CPX p1 = sp1;
+        for (BIGINT m1 = 0; m1 < ms; ++m1) {
+          f[m++] += cc * p1 * p2 * p3;
+          p1 *= a1;
+        }
+        p2 *= a2;
       }
       p3 *= a3;
     }
   }
 }
 
-void dirft3d2(BIGINT nj,FLT* x,FLT *y,FLT *z,CPX* c,int iflag,BIGINT ms, BIGINT mt, BIGINT mu, CPX* f)
+void dirft3d2(BIGINT nj, FLT *x, FLT *y, FLT *z, CPX *c, int iflag, BIGINT ms, BIGINT mt,
+              BIGINT mu, CPX *f)
 /* Direct computation of 3D type-2 nonuniform FFT. Interface same as finufft3d2
 
-     c[j] =   SUM    f[k1,k2,k3] exp(+-i (k1 x[j] + k2 y[j] + k3 z[j])) 
-            k1,k2,k3  
+     c[j] =   SUM    f[k1,k2,k3] exp(+-i (k1 x[j] + k2 y[j] + k3 z[j]))
+            k1,k2,k3
                             for j = 0,...,nj-1
     where sum is over -ms/2 <= k1 <= (ms-1)/2,  -mt/2 <= k2 <= (mt-1)/2,
                       -mu/2 <= k3 <= (mu-1)/2.
@@ -62,25 +64,25 @@ void dirft3d2(BIGINT nj,FLT* x,FLT *y,FLT *z,CPX* c,int iflag,BIGINT ms, BIGINT
     Uses C++ complex type and winding trick.  Barnett 2/1/17
 */
 {
-  BIGINT k1min = -(ms/2), k2min = -(mt/2), k3min = -(mu/2);  // integer divide
-  for (BIGINT j=0;j<nj;++j) {
-    CPX a1 = (iflag>0) ? exp(IMA*x[j]) : exp(-IMA*x[j]);
-    CPX a2 = (iflag>0) ? exp(IMA*y[j]) : exp(-IMA*y[j]);
-    CPX a3 = (iflag>0) ? exp(IMA*z[j]) : exp(-IMA*z[j]);
-    CPX sp1 = pow(a1,(FLT)k1min);
-    CPX sp2 = pow(a2,(FLT)k2min);
-    CPX p3 = pow(a3,(FLT)k3min);
-    CPX cc = CPX(0,0);
-    BIGINT m=0;      // input pointer
-    for (BIGINT m3=0;m3<mu;++m3) {
+  BIGINT k1min = -(ms / 2), k2min = -(mt / 2), k3min = -(mu / 2); // integer divide
+  for (BIGINT j = 0; j < nj; ++j) {
+    CPX a1   = (iflag > 0) ? exp(IMA * x[j]) : exp(-IMA * x[j]);
+    CPX a2   = (iflag > 0) ? exp(IMA * y[j]) : exp(-IMA * y[j]);
+    CPX a3   = (iflag > 0) ? exp(IMA * z[j]) : exp(-IMA * z[j]);
+    CPX sp1  = pow(a1, (FLT)k1min);
+    CPX sp2  = pow(a2, (FLT)k2min);
+    CPX p3   = pow(a3, (FLT)k3min);
+    CPX cc   = CPX(0, 0);
+    BIGINT m = 0; // input pointer
+    for (BIGINT m3 = 0; m3 < mu; ++m3) {
       CPX p2 = sp2;
-      for (BIGINT m2=0;m2<mt;++m2) {
-	CPX p1 = sp1;
-	for (BIGINT m1=0;m1<ms;++m1) {
-	  cc += f[m++] * p1 * p2 * p3;
-	  p1 *= a1;
-	}
-	p2 *= a2;
+      for (BIGINT m2 = 0; m2 < mt; ++m2) {
+        CPX p1 = sp1;
+        for (BIGINT m1 = 0; m1 < ms; ++m1) {
+          cc += f[m++] * p1 * p2 * p3;
+          p1 *= a1;
+        }
+        p2 *= a2;
       }
       p3 *= a3;
     }
@@ -88,22 +90,22 @@ void dirft3d2(BIGINT nj,FLT* x,FLT *y,FLT *z,CPX* c,int iflag,BIGINT ms, BIGINT
   }
 }
 
-void dirft3d3(BIGINT nj,FLT* x,FLT *y,FLT *z,CPX* c,int iflag,BIGINT nk, FLT* s, FLT* t, FLT *u, CPX* f)
+void dirft3d3(BIGINT nj, FLT *x, FLT *y, FLT *z, CPX *c, int iflag, BIGINT nk, FLT *s,
+              FLT *t, FLT *u, CPX *f)
 /* Direct computation of 3D type-3 nonuniform FFT. Interface same as finufft3d3
 c               nj-1
 c     f[k]  =   SUM   c[j] exp(+-i (s[k] x[j] + t[k] y[j] + u[k] z[j]))
-c               j=0                   
+c               j=0
 c                    for k = 0, ..., nk-1
 c  If iflag>0 the + sign is used, otherwise the - sign is used, in the
 c  exponential. Uses C++ complex type. Simple brute force.  Barnett 2/1/17
 */
 {
-  for (BIGINT k=0;k<nk;++k) {
-    CPX ss = (iflag>0) ? IMA*s[k] : -IMA*s[k];
-    CPX tt = (iflag>0) ? IMA*t[k] : -IMA*t[k];
-    CPX uu = (iflag>0) ? IMA*u[k] : -IMA*u[k];
-    f[k] = CPX(0,0);
-    for (BIGINT j=0;j<nj;++j)
-      f[k] += c[j] * exp(ss*x[j] + tt*y[j] + uu*z[j]);
+  for (BIGINT k = 0; k < nk; ++k) {
+    CPX ss = (iflag > 0) ? IMA * s[k] : -IMA * s[k];
+    CPX tt = (iflag > 0) ? IMA * t[k] : -IMA * t[k];
+    CPX uu = (iflag > 0) ? IMA * u[k] : -IMA * u[k];
+    f[k]   = CPX(0, 0);
+    for (BIGINT j = 0; j < nj; ++j) f[k] += c[j] * exp(ss * x[j] + tt * y[j] + uu * z[j]);
   }
 }
diff --git a/test/dumbinputs.cpp b/test/dumbinputs.cpp
index b1e8bc6a9..d48757aee 100644
--- a/test/dumbinputs.cpp
+++ b/test/dumbinputs.cpp
@@ -6,7 +6,7 @@
 
    Usage (linux):  ./dumbinputs{f} 2> /dev/null
    (since FINUFFT will spit msgs to stderr, to be ignored)
-   
+
    Pass: exit code 0. (Stdout should indicate passed)
    Fail: exit code>0. (Stdout may indicate what failed)
 
@@ -24,61 +24,62 @@
    Removed the chkbnds case to 1d1, 05/08/2024.
 
    Suggested compile:
-   g++ -std=c++14 -fopenmp dumbinputs.cpp -I../include ../lib/libfinufft.so -o dumbinputs -lfftw3 -lfftw3_omp -lm
-   g++ -std=c++14 -fopenmp dumbinputs.cpp -I../include ../lib/libfinufft.so -o dumbinputsf -lfftw3 -lfftw3_omp -lm -DSINGLE
+   g++ -std=c++14 -fopenmp dumbinputs.cpp -I../include ../lib/libfinufft.so -o dumbinputs
+       -lfftw3 -lfftw3_omp -lm
+   g++ -std=c++14 -fopenmp dumbinputs.cpp -I../include ../lib/libfinufft.so -o dumbinputsf -lfftw3 -lfftw3_omp -lm -DSINGLE
 
    or if you have built a single-core version:
-   g++ -std=c++14 dumbinputs.cpp -I../include ../lib/libfinufft.so -o dumbinputs -lfftw3 -lm
-   etc
+   g++ -std=c++14 dumbinputs.cpp -I../include ../lib/libfinufft.so -o dumbinputs -lfftw3 -lm
+   etc
 */
 
 // This switches FLT macro from double to float if SINGLE is defined, etc...
-#include <finufft/test_defs.h>
 #include "directft/dirft1d.cpp"
 #include "directft/dirft2d.cpp"
 #include "directft/dirft3d.cpp"
+#include <finufft/test_defs.h>
 using namespace std;
-using namespace finufft::utils;        // for twonorm, etc
+using namespace finufft::utils; // for twonorm, etc
 
-int main(int argc, char* argv[])
-{
-  int M = 100;            // number of nonuniform points
-  int N = 10;             // # modes, keep small, also output NU pts in type 3
+int main(int argc, char *argv[]) {
+  int M = 100;    // number of nonuniform points
+  int N = 10;     // # modes, keep small, also output NU pts in type 3
 #ifdef SINGLE
-  FLT acc = 1e-5;         // desired accuracy for NUFFTs  (prec-dep)
+  FLT acc = 1e-5; // desired accuracy for NUFFTs  (prec-dep)
 #else
-  FLT acc = 1e-8;         // desired accuracy for NUFFTs
+  FLT acc = 1e-8; // desired accuracy for NUFFTs
 #endif
-  finufft_opts opts; FINUFFT_DEFAULT_OPTS(&opts);
+  finufft_opts opts;
+  FINUFFT_DEFAULT_OPTS(&opts);
 
-  int NN = N*N*N;         // modes F alloc size since we'll go to 3d
+  int NN = N * N * N; // modes F alloc size since we'll go to 3d
   // generate some "random" nonuniform points (x) and complex strengths (c):
-  FLT *x = (FLT *)malloc(sizeof(FLT)*M);
-  CPX* c = (CPX*)malloc(sizeof(CPX)*M);
-  for (int j=0; j<M; ++j) {
-    x[j] = PI*cos((FLT)j);                           // deterministic
-    c[j] = sin((FLT)1.3*j) + IMA*cos((FLT)0.9*j);
+  FLT *x = (FLT *)malloc(sizeof(FLT) * M);
+  CPX *c = (CPX *)malloc(sizeof(CPX) * M);
+  for (int j = 0; j < M; ++j) {
+    x[j] = PI * cos((FLT)j); // deterministic
+    c[j] = sin((FLT)1.3 * j) + IMA * cos((FLT)0.9 * j);
   }
   // allocate output array F for Fourier modes, fix some type-3 coords...
-  CPX* F = (CPX*)malloc(sizeof(CPX)*NN);
-  FLT *s = (FLT*)malloc(sizeof(FLT)*N);
-  for (int k=0; k<N; ++k) s[k] = 10 * cos(1.2*k);   // normal-sized coords
-  FLT *shuge = (FLT*)malloc(sizeof(FLT)*N);
-  FLT huge = 1e12;                                  // no smaller than MAX_NF
-  for (int k=0; k<N; ++k) shuge[k] = huge * s[k];   // some huge coords
+  CPX *F = (CPX *)malloc(sizeof(CPX) * NN);
+  FLT *s = (FLT *)malloc(sizeof(FLT) * N);
+  for (int k = 0; k < N; ++k) s[k] = 10 * cos(1.2 * k); // normal-sized coords
+  FLT *shuge = (FLT *)malloc(sizeof(FLT) * N);
+  FLT huge   = 1e12;                                    // no smaller than MAX_NF
+  for (int k = 0; k < N; ++k) shuge[k] = huge * s[k];   // some huge coords
 
   // alloc exact output array
-  CPX* Fe = (CPX*)malloc(sizeof(CPX)*NN);
- 
+  CPX *Fe = (CPX *)malloc(sizeof(CPX) * NN);
+
   // some useful debug printing...
-  //for (int k=0;k<N;++k) printf("F[%d] = %g+%gi\n",k,real(F[k]),imag(F[k]));
-  //for (int j=0;j<M;++j) printf("c[%d] = %g+%gi\n",j,real(c[j]),imag(c[j]));
-  //printf("%.3g %3g\n",twonorm(N,F),twonorm(M,c));
-  opts.debug = 0;   // set to 1,2, to debug inside FINUFFT, etc segfaults
+  // for (int k=0;k<N;++k) printf("F[%d] = %g+%gi\n",k,real(F[k]),imag(F[k]));
+  // for (int j=0;j<M;++j) printf("c[%d] = %g+%gi\n",j,real(c[j]),imag(c[j]));
+  // printf("%.3g %3g\n",twonorm(N,F),twonorm(M,c));
+  opts.debug        = 0; // set to 1,2, to debug inside FINUFFT, etc segfaults
   opts.spread_debug = 0;
 
-  opts.nthreads = 1;       // to keep them fast (thread-launch is slow)
-  
+  opts.nthreads = 1; // to keep them fast (thread-launch is slow)
+
 #ifdef SINGLE
   printf("dumbinputsf test start...\n");
 #else
@@ -87,601 +88,626 @@ int main(int argc, char* argv[])
 
   // 111111111111111111111111111111111111111111111111111111111111111111111111
   printf("1D dumb cases.\n");
-  int ier = FINUFFT1D1(M,x,c,+1,0,N,F,&opts);
+  int ier = FINUFFT1D1(M, x, c, +1, 0, N, F, &opts);
   if (ier != FINUFFT_WARN_EPS_TOO_SMALL) {
-    printf("1d1 tol=0:\twrong err code %d\n",ier);
+    printf("1d1 tol=0:\twrong err code %d\n", ier);
     return 1;
   }
-  ier = FINUFFT1D1(M,x,c,+1,acc,0,F,&opts);
+  ier = FINUFFT1D1(M, x, c, +1, acc, 0, F, &opts);
   if (ier) {
-    printf("1d1 N=0:\tier=%d\n",ier);
+    printf("1d1 N=0:\tier=%d\n", ier);
     return ier;
   }
-  ier = FINUFFT1D1(-1,x,c,+1,acc,0,F,&opts);
+  ier = FINUFFT1D1(-1, x, c, +1, acc, 0, F, &opts);
   if (ier != FINUFFT_ERR_NUM_NU_PTS_INVALID) {
-    printf("1d1 M<0:\twrong err code %d\n",ier);
+    printf("1d1 M<0:\twrong err code %d\n", ier);
     return 1;
   }
-  int64_t Mhuge = (int64_t)(1e16);   // cf defs.h MAX_NU_PTS
-  ier = FINUFFT1D1(Mhuge,x,c,+1,acc,0,F,&opts);
+  int64_t Mhuge = (int64_t)(1e16); // cf defs.h MAX_NU_PTS
+  ier           = FINUFFT1D1(Mhuge, x, c, +1, acc, 0, F, &opts);
   if (ier != FINUFFT_ERR_NUM_NU_PTS_INVALID) {
-    printf("1d1 M huge:\twrong err code %d\n",ier);
+    printf("1d1 M huge:\twrong err code %d\n", ier);
     return 1;
   }
-  ier = FINUFFT1D1(0,x,c,+1,acc,N,F,&opts);
-  FLT t = twonorm(N,F);
-  if (ier || t!=0.0) {
-    printf("1d1 M=0:\tier=%d nrm(F)=%.3g\n",ier,t);
+  ier   = FINUFFT1D1(0, x, c, +1, acc, N, F, &opts);
+  FLT t = twonorm(N, F);
+  if (ier || t != 0.0) {
+    printf("1d1 M=0:\tier=%d nrm(F)=%.3g\n", ier, t);
     return 1;
   }
-  for (int k=0; k<NN; ++k) F[k] = sin((FLT)0.7*k) + IMA*cos((FLT)0.3*k);  // set F for t2
-  ier = FINUFFT1D2(M,x,c,+1,0,N,F,&opts);
+  for (int k = 0; k < NN; ++k)
+    F[k] = sin((FLT)0.7 * k) + IMA * cos((FLT)0.3 * k); // set F for t2
+  ier = FINUFFT1D2(M, x, c, +1, 0, N, F, &opts);
   if (ier != FINUFFT_WARN_EPS_TOO_SMALL) {
-    printf("1d2 tol=0:\twrong err code %d\n",ier);
+    printf("1d2 tol=0:\twrong err code %d\n", ier);
     return 1;
   }
-  ier = FINUFFT1D2(M,x,c,+1,acc,0,F,&opts);
-  t = twonorm(M,c);
-  if (ier || t!=0.0) {
-    printf("1d2 N=0:\tier=%d nrm(c)=%.3g\n",ier,t);
+  ier = FINUFFT1D2(M, x, c, +1, acc, 0, F, &opts);
+  t   = twonorm(M, c);
+  if (ier || t != 0.0) {
+    printf("1d2 N=0:\tier=%d nrm(c)=%.3g\n", ier, t);
     return 1;
   }
-  ier = FINUFFT1D2(0,x,c,+1,acc,N,F,&opts);
+  ier = FINUFFT1D2(0, x, c, +1, acc, N, F, &opts);
   if (ier) {
-    printf("1d2 M=0:\tier=%d\n",ier);
+    printf("1d2 M=0:\tier=%d\n", ier);
     return ier;
   }
-  for (int j=0; j<M; ++j) c[j] = sin((FLT)1.3*j) + IMA*cos((FLT)0.9*j); // reset c for t3
-  ier = FINUFFT1D3(M,x,c,+1,0,N,s,F,&opts);
+  for (int j = 0; j < M; ++j)
+    c[j] = sin((FLT)1.3 * j) + IMA * cos((FLT)0.9 * j); // reset c for t3
+  ier = FINUFFT1D3(M, x, c, +1, 0, N, s, F, &opts);
   if (ier != FINUFFT_WARN_EPS_TOO_SMALL) {
-    printf("1d3 tol=0:\twrong err code %d\n",ier);
+    printf("1d3 tol=0:\twrong err code %d\n", ier);
     return 1;
   }
-  ier = FINUFFT1D3(M,x,c,+1,acc,0,s,F,&opts);
+  ier = FINUFFT1D3(M, x, c, +1, acc, 0, s, F, &opts);
   if (ier) {
-    printf("1d3 nk=0:\tier=%d\n",ier);
+    printf("1d3 nk=0:\tier=%d\n", ier);
     return ier;
   }
-  ier = FINUFFT1D3(M,x,c,+1,acc,-1,s,F,&opts);
+  ier = FINUFFT1D3(M, x, c, +1, acc, -1, s, F, &opts);
   if (ier != FINUFFT_ERR_NUM_NU_PTS_INVALID) {
-    printf("1d3 nk=-1:\twrong err code %d\n",ier);
+    printf("1d3 nk=-1:\twrong err code %d\n", ier);
     return 1;
   }
-  ier = FINUFFT1D3(M,x,c,+1,acc,Mhuge,s,F,&opts);
+  ier = FINUFFT1D3(M, x, c, +1, acc, Mhuge, s, F, &opts);
   if (ier != FINUFFT_ERR_NUM_NU_PTS_INVALID) {
-    printf("1d3 nk huge:\twrong err code %d\n",ier);
+    printf("1d3 nk huge:\twrong err code %d\n", ier);
     return 1;
   }
-  ier = FINUFFT1D3(0,x,c,+1,acc,N,s,F,&opts);
-  t = twonorm(N,F);
-  if (ier || t!=0.0) {
-    printf("1d3 M=0:\tier=%d nrm(F)=%.3g\n",ier,t);
+  ier = FINUFFT1D3(0, x, c, +1, acc, N, s, F, &opts);
+  t   = twonorm(N, F);
+  if (ier || t != 0.0) {
+    printf("1d3 M=0:\tier=%d nrm(F)=%.3g\n", ier, t);
     return 1;
   }
   // for type 3 only we include crude accuracy check for 1-NUpt (I/O) cases...
-  ier = FINUFFT1D3(1,x,c,+1,acc,N,s,F,&opts);   // XK prod formally 0
-  dirft1d3(1,x,c,+1,N,s,Fe); for (int k=0; k<N; ++k) F[k] -= Fe[k]; // acc chk
-  FLT err = twonorm(N,F)/sqrt((FLT)N);
-  if (ier || err>100*acc) {
-    printf("1d3 M=1:\tier=%d nrm(err)=%.3g\n",ier,err);
+  ier = FINUFFT1D3(1, x, c, +1, acc, N, s, F, &opts); // XK prod formally 0
+  dirft1d3(1, x, c, +1, N, s, Fe);
+  for (int k = 0; k < N; ++k) F[k] -= Fe[k];          // acc chk
+  FLT err = twonorm(N, F) / sqrt((FLT)N);
+  if (ier || err > 100 * acc) {
+    printf("1d3 M=1:\tier=%d nrm(err)=%.3g\n", ier, err);
     return 1;
   }
-  ier = FINUFFT1D3(M,x,c,+1,acc,1,s,F,&opts);
-  dirft1d3(M,x,c,+1,1,s,Fe);
-  err = abs(F[0]-Fe[0]);
-  if (ier || err>10*acc) {
-    printf("1d3 nk=1:\tier=%d err=%.3g\n",ier,err);
+  ier = FINUFFT1D3(M, x, c, +1, acc, 1, s, F, &opts);
+  dirft1d3(M, x, c, +1, 1, s, Fe);
+  err = abs(F[0] - Fe[0]);
+  if (ier || err > 10 * acc) {
+    printf("1d3 nk=1:\tier=%d err=%.3g\n", ier, err);
     return 1;
   }
-  ier = FINUFFT1D3(1,x,c,+1,acc,1,s,F,&opts);
-  dirft1d3(1,x,c,+1,1,s,Fe);
-  err = abs(F[0]-Fe[0]);
-  if (ier || err>10*acc) {
-    printf("1d3 M=nk=1:\tier=%d err=%.3g\n",ier,err);
+  ier = FINUFFT1D3(1, x, c, +1, acc, 1, s, F, &opts);
+  dirft1d3(1, x, c, +1, 1, s, Fe);
+  err = abs(F[0] - Fe[0]);
+  if (ier || err > 10 * acc) {
+    printf("1d3 M=nk=1:\tier=%d err=%.3g\n", ier, err);
     return 1;
   }
-  ier = FINUFFT1D3(M,x,c,+1,acc,N,shuge,F,&opts);
-  if (ier==0) {          // any nonzero code accepted here
-    printf("1d3 XK prod too big:\twrong error code %d\n",ier);
+  ier = FINUFFT1D3(M, x, c, +1, acc, N, shuge, F, &opts);
+  if (ier == 0) { // any nonzero code accepted here
+    printf("1d3 XK prod too big:\twrong error code %d\n", ier);
     return 1;
   }
-  int ndata = 10;                 // how many multiple vectors to test it on
-  CPX* cm = (CPX*)malloc(sizeof(CPX)*M*ndata);
-  CPX* Fm = (CPX*)malloc(sizeof(CPX)*NN*ndata);     // the biggest array
-  for (int j=0; j<M*ndata; ++j) cm[j] = sin((FLT)1.3*j) + IMA*cos((FLT)0.9*j); // set cm for 1d1many
-  ier = FINUFFT1D1MANY(0,M,x,cm,+1,0,N,Fm,&opts);
+  int ndata = 10; // how many multiple vectors to test it on
+  CPX *cm   = (CPX *)malloc(sizeof(CPX) * M * ndata);
+  CPX *Fm   = (CPX *)malloc(sizeof(CPX) * NN * ndata);   // the biggest array
+  for (int j = 0; j < M * ndata; ++j)
+    cm[j] = sin((FLT)1.3 * j) + IMA * cos((FLT)0.9 * j); // set cm for 1d1many
+  ier = FINUFFT1D1MANY(0, M, x, cm, +1, 0, N, Fm, &opts);
   if (ier != FINUFFT_ERR_NTRANS_NOTVALID) {
-    printf("1d1many ndata=0:\twrong err code %d\n",ier);
+    printf("1d1many ndata=0:\twrong err code %d\n", ier);
     return 1;
   }
-  ier = FINUFFT1D1MANY(ndata,M,x,cm,+1,0,N,Fm,&opts);
+  ier = FINUFFT1D1MANY(ndata, M, x, cm, +1, 0, N, Fm, &opts);
   if (ier != FINUFFT_WARN_EPS_TOO_SMALL) {
-    printf("1d1many tol=0:\twrong err code %d\n",ier);
+    printf("1d1many tol=0:\twrong err code %d\n", ier);
     return 1;
   }
-  ier = FINUFFT1D1MANY(ndata,M,x,cm,+1,acc,0,Fm,&opts);
+  ier = FINUFFT1D1MANY(ndata, M, x, cm, +1, acc, 0, Fm, &opts);
   if (ier) {
-    printf("1d1many N=0:\tier=%d\n",ier);
+    printf("1d1many N=0:\tier=%d\n", ier);
     return ier;
   }
-  ier = FINUFFT1D1MANY(ndata,0,x,cm,+1,acc,N,Fm,&opts);
-  t = twonorm(N*ndata,Fm);
-  if (ier || t!=0.0) {
-    printf("1d1many M=0:\tier=%d nrm(Fm)=%.3g\n",ier,t);
+  ier = FINUFFT1D1MANY(ndata, 0, x, cm, +1, acc, N, Fm, &opts);
+  t   = twonorm(N * ndata, Fm);
+  if (ier || t != 0.0) {
+    printf("1d1many M=0:\tier=%d nrm(Fm)=%.3g\n", ier, t);
     return 1;
   }
-  for (int k=0; k<NN*ndata; ++k) Fm[k] = sin((FLT)0.7*k) + IMA*cos((FLT)0.3*k);  // set Fm for 1d2many
-  ier = FINUFFT1D2MANY(0,M,x,cm,+1,0,N,Fm,&opts);
+  for (int k = 0; k < NN * ndata; ++k)
+    Fm[k] = sin((FLT)0.7 * k) + IMA * cos((FLT)0.3 * k); // set Fm for 1d2many
+  ier = FINUFFT1D2MANY(0, M, x, cm, +1, 0, N, Fm, &opts);
   if (ier != FINUFFT_ERR_NTRANS_NOTVALID) {
-    printf("1d2many ndata=0:\twrong err code %d\n",ier);
+    printf("1d2many ndata=0:\twrong err code %d\n", ier);
     return 1;
   }
-  ier = FINUFFT1D2MANY(ndata,M,x,cm,+1,0,N,Fm,&opts);
+  ier = FINUFFT1D2MANY(ndata, M, x, cm, +1, 0, N, Fm, &opts);
   if (ier != FINUFFT_WARN_EPS_TOO_SMALL) {
-    printf("1d2many tol=0:\twrong err code %d\n",ier);
+    printf("1d2many tol=0:\twrong err code %d\n", ier);
     return 1;
   }
-  ier = FINUFFT1D2MANY(ndata,M,x,cm,+1,acc,0,Fm,&opts);
-  t = twonorm(N*ndata,cm);
-  if (ier || t!=0.0) {
-    printf("1d2many N=0:\tier=%d nrm(cm)=%.3g\n",ier,t);
+  ier = FINUFFT1D2MANY(ndata, M, x, cm, +1, acc, 0, Fm, &opts);
+  t   = twonorm(N * ndata, cm);
+  if (ier || t != 0.0) {
+    printf("1d2many N=0:\tier=%d nrm(cm)=%.3g\n", ier, t);
     return 1;
   }
-  ier = FINUFFT1D2MANY(ndata,0,x,cm,+1,acc,N,Fm,&opts);
+  ier = FINUFFT1D2MANY(ndata, 0, x, cm, +1, acc, N, Fm, &opts);
   if (ier) {
-    printf("1d2many M=0:\tier=%d\n",ier);
+    printf("1d2many M=0:\tier=%d\n", ier);
     return ier;
   }
-  for (int j=0; j<M*ndata; ++j) cm[j] = sin((FLT)1.3*j) + IMA*cos((FLT)0.9*j); // reset cm for 1d3many
-  ier = FINUFFT1D3MANY(0, M,x,cm,+1,acc,N,s,Fm,&opts);
+  for (int j = 0; j < M * ndata; ++j)
+    cm[j] = sin((FLT)1.3 * j) + IMA * cos((FLT)0.9 * j); // reset cm for 1d3many
+  ier = FINUFFT1D3MANY(0, M, x, cm, +1, acc, N, s, Fm, &opts);
   if (ier != FINUFFT_ERR_NTRANS_NOTVALID) {
-    printf("1d3many ndata=0:\twrong err code %d\n",ier);
+    printf("1d3many ndata=0:\twrong err code %d\n", ier);
     return 1;
   }
-  ier = FINUFFT1D3MANY(ndata, M,x,cm,+1,0,N,s,Fm,&opts);
+  ier = FINUFFT1D3MANY(ndata, M, x, cm, +1, 0, N, s, Fm, &opts);
   if (ier != FINUFFT_WARN_EPS_TOO_SMALL) {
-    printf("1d3many tol=0:\twrong err code %d\n",ier);
+    printf("1d3many tol=0:\twrong err code %d\n", ier);
     return 1;
   }
-  ier = FINUFFT1D3MANY(ndata, M,x,cm,+1,acc,0,s,Fm,&opts);
+  ier = FINUFFT1D3MANY(ndata, M, x, cm, +1, acc, 0, s, Fm, &opts);
   if (ier) {
-    printf("1d3many nk=0:\tier=%d\n",ier);
+    printf("1d3many nk=0:\tier=%d\n", ier);
     return ier;
   }
-  ier = FINUFFT1D3MANY(ndata, 0,x,cm,+1,acc,N,s,Fm,&opts);
-  t = twonorm(N,Fm);
+  ier = FINUFFT1D3MANY(ndata, 0, x, cm, +1, acc, N, s, Fm, &opts);
+  t   = twonorm(N, Fm);
   // again, as above, only crude acc tests for 1-NUpt (I/O) case...
-  ier = FINUFFT1D3MANY(ndata, 1,x,cm,+1,acc,N,s,Fm,&opts);   // XK prod formally 0
-  dirft1d3(1,x,c,+1,N,s,Fe); for (int k=0; k<N; ++k) Fm[k] -= Fe[k]; // acc chk
-  err = twonorm(N,Fm)/sqrt((FLT)N);  // rms, to 5e-5 abs; check just first trial
-  if (ier || err>100*acc) {
-    printf("1d3many M=1:\tier=%d nrm(err)=%.3g\n",ier,err);
+  ier = FINUFFT1D3MANY(ndata, 1, x, cm, +1, acc, N, s, Fm, &opts); // XK prod formally 0
+  dirft1d3(1, x, c, +1, N, s, Fe);
+  for (int k = 0; k < N; ++k) Fm[k] -= Fe[k];                      // acc chk
+  err = twonorm(N, Fm) / sqrt((FLT)N); // rms, to 5e-5 abs; check just first trial
+  if (ier || err > 100 * acc) {
+    printf("1d3many M=1:\tier=%d nrm(err)=%.3g\n", ier, err);
     return 1;
   }
-  ier = FINUFFT1D3MANY(ndata,M,x,cm,+1,acc,1,s,Fm,&opts);
-  dirft1d3(M,x,c,+1,1,s,Fe);
-  err = abs(Fm[0]-Fe[0]);
-  if (ier || err>10*acc) {
-    printf("1d3many nk=1:\tier=%d err=%.3g\n",ier,err);
+  ier = FINUFFT1D3MANY(ndata, M, x, cm, +1, acc, 1, s, Fm, &opts);
+  dirft1d3(M, x, c, +1, 1, s, Fe);
+  err = abs(Fm[0] - Fe[0]);
+  if (ier || err > 10 * acc) {
+    printf("1d3many nk=1:\tier=%d err=%.3g\n", ier, err);
     return 1;
   }
-  ier = FINUFFT1D3MANY(ndata,1,x,cm,+1,acc,1,s,Fm,&opts);
-  dirft1d3(1,x,c,+1,1,s,Fe);
-  err = abs(Fm[0]-Fe[0]);
-  if (ier || err>10*acc) {
-    printf("1d3many M=nk=1:\tier=%d err=%.3g\n",ier,err);
+  ier = FINUFFT1D3MANY(ndata, 1, x, cm, +1, acc, 1, s, Fm, &opts);
+  dirft1d3(1, x, c, +1, 1, s, Fe);
+  err = abs(Fm[0] - Fe[0]);
+  if (ier || err > 10 * acc) {
+    printf("1d3many M=nk=1:\tier=%d err=%.3g\n", ier, err);
     return 1;
   }
-  ier = FINUFFT1D3MANY(ndata,M,x,cm,+1,acc,N,shuge,Fm,&opts);
-  if (ier==0) {          // any nonzero code accepted here
-    printf("1d3many XK prod too big:\twrong error code %d\n",ier);
+  ier = FINUFFT1D3MANY(ndata, M, x, cm, +1, acc, N, shuge, Fm, &opts);
+  if (ier == 0) { // any nonzero code accepted here
+    printf("1d3many XK prod too big:\twrong error code %d\n", ier);
     return 1;
   }
 
   // 2222222222222222222222222222222222222222222222222222222222222222222222222
   printf("2D dumb cases.\n"); // (uses y=x, and t=s in type 3)
-  ier = FINUFFT2D1(M,x,x,c,+1,0,N,N,F,&opts);
+  ier = FINUFFT2D1(M, x, x, c, +1, 0, N, N, F, &opts);
   if (ier != FINUFFT_WARN_EPS_TOO_SMALL) {
-    printf("2d1 tol=0:\twrong err code %d\n",ier);
+    printf("2d1 tol=0:\twrong err code %d\n", ier);
     return 1;
   }
-  ier = FINUFFT2D1(M,x,x,c,+1,acc,0,0,F,&opts);
+  ier = FINUFFT2D1(M, x, x, c, +1, acc, 0, 0, F, &opts);
   if (ier) {
-    printf("2d1 Ns=Nt=0:\tier=%d\n",ier);
+    printf("2d1 Ns=Nt=0:\tier=%d\n", ier);
     return ier;
   }
-  ier = FINUFFT2D1(M,x,x,c,+1,acc,0,N,F,&opts);
+  ier = FINUFFT2D1(M, x, x, c, +1, acc, 0, N, F, &opts);
   if (ier) {
-    printf("2d1 Ns=0,Nt>0:\tier=%d\n",ier);
+    printf("2d1 Ns=0,Nt>0:\tier=%d\n", ier);
     return ier;
   }
-  ier = FINUFFT2D1(M,x,x,c,+1,acc,N,0,F,&opts);
+  ier = FINUFFT2D1(M, x, x, c, +1, acc, N, 0, F, &opts);
   if (ier) {
-    printf("2d1 Ns>0,Nt=0:\tier=%d\n",ier);
+    printf("2d1 Ns>0,Nt=0:\tier=%d\n", ier);
     return ier;
   }
-  ier = FINUFFT2D1(0,x,x,c,+1,acc,N,N,F,&opts);
-  t = twonorm(N,F);
-  if (ier || t!=0.0) {
-    printf("2d1 M=0:\tier=%d nrm(F)=%.3g\n",ier,t);
+  ier = FINUFFT2D1(0, x, x, c, +1, acc, N, N, F, &opts);
+  t   = twonorm(N, F);
+  if (ier || t != 0.0) {
+    printf("2d1 M=0:\tier=%d nrm(F)=%.3g\n", ier, t);
     return 1;
   }
-  for (int k=0; k<NN; ++k) F[k] = sin((FLT)0.7*k) + IMA*cos((FLT)0.3*k);  // set F for t2
-  ier = FINUFFT2D2(M,x,x,c,+1,0,N,N,F,&opts);
+  for (int k = 0; k < NN; ++k)
+    F[k] = sin((FLT)0.7 * k) + IMA * cos((FLT)0.3 * k); // set F for t2
+  ier = FINUFFT2D2(M, x, x, c, +1, 0, N, N, F, &opts);
   if (ier != FINUFFT_WARN_EPS_TOO_SMALL) {
-    printf("2d2 tol=0:\twrong err code %d\n",ier);
+    printf("2d2 tol=0:\twrong err code %d\n", ier);
     return 1;
   }
-  ier = FINUFFT2D2(M,x,x,c,+1,acc,0,0,F,&opts);
-  t = twonorm(M,c);
-  if (ier || t!=0.0) {
-    printf("2d2 Ns=Nt=0:\tier=%d nrm(c)=%.3g\n",ier,t);
+  ier = FINUFFT2D2(M, x, x, c, +1, acc, 0, 0, F, &opts);
+  t   = twonorm(M, c);
+  if (ier || t != 0.0) {
+    printf("2d2 Ns=Nt=0:\tier=%d nrm(c)=%.3g\n", ier, t);
     return 1;
   }
-  ier = FINUFFT2D2(M,x,x,c,+1,acc,0,N,F,&opts);
-  t = twonorm(M,c);
-  if (ier || t!=0.0) {
-    printf("2d2 Ns=0,Nt>0:\tier=%d nrm(c)=%.3g\n",ier,t);
+  ier = FINUFFT2D2(M, x, x, c, +1, acc, 0, N, F, &opts);
+  t   = twonorm(M, c);
+  if (ier || t != 0.0) {
+    printf("2d2 Ns=0,Nt>0:\tier=%d nrm(c)=%.3g\n", ier, t);
     return 1;
   }
-  ier = FINUFFT2D2(M,x,x,c,+1,acc,N,0,F,&opts);
-  t = twonorm(M,c);
-  if (ier || t!=0.0) {
-    printf("2d2 Ns>0,Nt=0:\tier=%d nrm(c)=%.3g\n",ier,t);
+  ier = FINUFFT2D2(M, x, x, c, +1, acc, N, 0, F, &opts);
+  t   = twonorm(M, c);
+  if (ier || t != 0.0) {
+    printf("2d2 Ns>0,Nt=0:\tier=%d nrm(c)=%.3g\n", ier, t);
     return 1;
   }
-  ier = FINUFFT2D2(0,x,x,c,+1,acc,N,N,F,&opts);
+  ier = FINUFFT2D2(0, x, x, c, +1, acc, N, N, F, &opts);
   if (ier) {
-    printf("2d2 M=0:\tier=%d\n",ier);
+    printf("2d2 M=0:\tier=%d\n", ier);
     return ier;
   }
-  for (int j=0; j<M; ++j) c[j] = sin((FLT)1.3*j) + IMA*cos((FLT)0.9*j); // reset c for t3
-  ier = FINUFFT2D3(M,x,x,c,+1,0,N,s,s,F,&opts);
+  for (int j = 0; j < M; ++j)
+    c[j] = sin((FLT)1.3 * j) + IMA * cos((FLT)0.9 * j); // reset c for t3
+  ier = FINUFFT2D3(M, x, x, c, +1, 0, N, s, s, F, &opts);
   if (ier != FINUFFT_WARN_EPS_TOO_SMALL) {
-    printf("2d3 tol=0:\twrong err code %d\n",ier);
+    printf("2d3 tol=0:\twrong err code %d\n", ier);
     return 1;
   }
-  ier = FINUFFT2D3(M,x,x,c,+1,acc,0,s,s,F,&opts);
+  ier = FINUFFT2D3(M, x, x, c, +1, acc, 0, s, s, F, &opts);
   if (ier) {
-    printf("2d3 nk=0:\tier=%d\n",ier);
+    printf("2d3 nk=0:\tier=%d\n", ier);
     return ier;
   }
-  ier = FINUFFT2D3(0,x,x,c,+1,acc,N,s,s,F,&opts);
-  t = twonorm(N,F);
-  if (ier || t!=0.0) {
-    printf("2d3 M=0:\tier=%d nrm(F)=%.3g\n",ier,t);
+  ier = FINUFFT2D3(0, x, x, c, +1, acc, N, s, s, F, &opts);
+  t   = twonorm(N, F);
+  if (ier || t != 0.0) {
+    printf("2d3 M=0:\tier=%d nrm(F)=%.3g\n", ier, t);
     return 1;
   }
-  ier = FINUFFT2D3(1,x,x,c,+1,acc,N,s,s,F,&opts);   // XK prod formally 0
+  ier = FINUFFT2D3(1, x, x, c, +1, acc, N, s, s, F, &opts); // XK prod formally 0
   // we don't check the M=nk=1 case for >1D since guess that 1D would catch it.
   if (ier) {
-    printf("2d3 M=nk=1:\tier=%d\n",ier);
+    printf("2d3 M=nk=1:\tier=%d\n", ier);
     return ier;
-  }  
-  for (int k=0; k<N; ++k) shuge[k] = sqrt(huge)*s[k];     // less huge coords
-  ier = FINUFFT2D3(M,x,x,c,+1,acc,N,shuge,shuge,F,&opts);
-  if (ier==0) {          // any nonzero code accepted here
-    printf("2d3 XK prod too big:\twrong error code %d\n",ier);
+  }
+  for (int k = 0; k < N; ++k) shuge[k] = sqrt(huge) * s[k]; // less huge coords
+  ier = FINUFFT2D3(M, x, x, c, +1, acc, N, shuge, shuge, F, &opts);
+  if (ier == 0) { // any nonzero code accepted here
+    printf("2d3 XK prod too big:\twrong error code %d\n", ier);
     return 1;
   }
-  for (int j=0; j<M*ndata; ++j) cm[j] = sin((FLT)1.3*j) + IMA*cos((FLT)0.9*j); // reset cm for 2d1many
-  ier = FINUFFT2D1MANY(0,M,x,x,cm,+1,0,N,N,Fm,&opts);
+  for (int j = 0; j < M * ndata; ++j)
+    cm[j] = sin((FLT)1.3 * j) + IMA * cos((FLT)0.9 * j); // reset cm for 2d1many
+  ier = FINUFFT2D1MANY(0, M, x, x, cm, +1, 0, N, N, Fm, &opts);
   if (ier != FINUFFT_ERR_NTRANS_NOTVALID) {
-    printf("2d1many ndata=0:\twrong err code %d\n",ier);
+    printf("2d1many ndata=0:\twrong err code %d\n", ier);
     return 1;
   }
-  ier = FINUFFT2D1MANY(ndata,M,x,x,cm,+1,0,N,N,Fm,&opts);
+  ier = FINUFFT2D1MANY(ndata, M, x, x, cm, +1, 0, N, N, Fm, &opts);
   if (ier != FINUFFT_WARN_EPS_TOO_SMALL) {
-    printf("2d1many tol=0:\twrong err code %d\n",ier);
+    printf("2d1many tol=0:\twrong err code %d\n", ier);
     return 1;
   }
-  ier = FINUFFT2D1MANY(ndata,M,x,x,cm,+1,acc,0,0,Fm,&opts);
+  ier = FINUFFT2D1MANY(ndata, M, x, x, cm, +1, acc, 0, 0, Fm, &opts);
   if (ier) {
-    printf("2d1many Ns=Nt=0:\tier=%d\n",ier);
+    printf("2d1many Ns=Nt=0:\tier=%d\n", ier);
     return ier;
   }
-  ier = FINUFFT2D1MANY(ndata,M,x,x,cm,+1,acc,0,N,Fm,&opts);
+  ier = FINUFFT2D1MANY(ndata, M, x, x, cm, +1, acc, 0, N, Fm, &opts);
   if (ier) {
-    printf("2d1many Ns=0,Nt>0:\tier=%d\n",ier);
+    printf("2d1many Ns=0,Nt>0:\tier=%d\n", ier);
     return ier;
   }
-  ier = FINUFFT2D1MANY(ndata,M,x,x,cm,+1,acc,N,0,Fm,&opts);
+  ier = FINUFFT2D1MANY(ndata, M, x, x, cm, +1, acc, N, 0, Fm, &opts);
   if (ier) {
-    printf("2d1many Ns>0,Nt=0:\tier=%d\n",ier);
+    printf("2d1many Ns>0,Nt=0:\tier=%d\n", ier);
     return ier;
   }
-  ier = FINUFFT2D1MANY(ndata,0,x,x,cm,+1,acc,N,N,Fm,&opts);
-  t = twonorm(N*ndata,Fm);
-  if (ier || t!=0.0) {
-    printf("2d1many M=0:\tier=%d nrm(Fm)=%.3g\n",ier,t);
+  ier = FINUFFT2D1MANY(ndata, 0, x, x, cm, +1, acc, N, N, Fm, &opts);
+  t   = twonorm(N * ndata, Fm);
+  if (ier || t != 0.0) {
+    printf("2d1many M=0:\tier=%d nrm(Fm)=%.3g\n", ier, t);
     return 1;
   }
-  for (int k=0; k<NN*ndata; ++k) Fm[k] = sin((FLT)0.7*k) + IMA*cos((FLT)0.3*k);  // reset Fm for t2
-  ier = FINUFFT2D2MANY(0,M,x,x,cm,+1,0,N,N,Fm,&opts);
+  for (int k = 0; k < NN * ndata; ++k)
+    Fm[k] = sin((FLT)0.7 * k) + IMA * cos((FLT)0.3 * k); // reset Fm for t2
+  ier = FINUFFT2D2MANY(0, M, x, x, cm, +1, 0, N, N, Fm, &opts);
   if (ier != FINUFFT_ERR_NTRANS_NOTVALID) {
-    printf("2d2many ndata=0:\twrong err code %d\n",ier);
+    printf("2d2many ndata=0:\twrong err code %d\n", ier);
     return 1;
   }
-  ier = FINUFFT2D2MANY(ndata,M,x,x,cm,+1,0,N,N,Fm,&opts);
+  ier = FINUFFT2D2MANY(ndata, M, x, x, cm, +1, 0, N, N, Fm, &opts);
   if (ier != FINUFFT_WARN_EPS_TOO_SMALL) {
-    printf("2d2many tol=0:\twrong err code %d\n",ier);
+    printf("2d2many tol=0:\twrong err code %d\n", ier);
     return 1;
   }
-  ier = FINUFFT2D2MANY(ndata,M,x,x,cm,+1,acc,0,0,Fm,&opts);
-  t = twonorm(M*ndata,cm);
-  if (ier || t!=0.0) {
-    printf("2d2many Ns=Nt=0:\tier=%d nrm(cm)=%.3g\n", ier,t);
+  ier = FINUFFT2D2MANY(ndata, M, x, x, cm, +1, acc, 0, 0, Fm, &opts);
+  t   = twonorm(M * ndata, cm);
+  if (ier || t != 0.0) {
+    printf("2d2many Ns=Nt=0:\tier=%d nrm(cm)=%.3g\n", ier, t);
     return 1;
   }
-  ier = FINUFFT2D2MANY(ndata,M,x,x,cm,+1,acc,0,N,Fm,&opts);
-  t = twonorm(M*ndata,cm);
-  if (ier || t!=0.0) {
-    printf("2d2many Ns=0,Nt>0:\tier=%d nrm(cm)=%.3g\n", ier,t);
+  ier = FINUFFT2D2MANY(ndata, M, x, x, cm, +1, acc, 0, N, Fm, &opts);
+  t   = twonorm(M * ndata, cm);
+  if (ier || t != 0.0) {
+    printf("2d2many Ns=0,Nt>0:\tier=%d nrm(cm)=%.3g\n", ier, t);
     return 1;
   }
-  ier = FINUFFT2D2MANY(ndata,M,x,x,cm,+1,acc,N,0,Fm,&opts);
-  t = twonorm(M*ndata,cm);
-  if (ier || t!=0.0) {
-    printf("2d2many Ns>0,Nt=0:\tier=%d nrm(cm)=%.3g\n", ier,t);
+  ier = FINUFFT2D2MANY(ndata, M, x, x, cm, +1, acc, N, 0, Fm, &opts);
+  t   = twonorm(M * ndata, cm);
+  if (ier || t != 0.0) {
+    printf("2d2many Ns>0,Nt=0:\tier=%d nrm(cm)=%.3g\n", ier, t);
     return 1;
   }
-  ier = FINUFFT2D2MANY(ndata,0,x,x,cm,+1,acc,N,N,Fm,&opts);
+  ier = FINUFFT2D2MANY(ndata, 0, x, x, cm, +1, acc, N, N, Fm, &opts);
   if (ier) {
-    printf("2d2many M=0:\tier=%d\n",ier);
+    printf("2d2many M=0:\tier=%d\n", ier);
     return ier;
   }
-  ier = FINUFFT2D3MANY(0,M,x,x,cm,+1,0,N,s,s,Fm,&opts);
+  ier = FINUFFT2D3MANY(0, M, x, x, cm, +1, 0, N, s, s, Fm, &opts);
   if (ier != FINUFFT_ERR_NTRANS_NOTVALID) {
-    printf("2d3many ndata=0:\twrong err code %d\n",ier);
+    printf("2d3many ndata=0:\twrong err code %d\n", ier);
     return 1;
   }
-  ier = FINUFFT2D3MANY(ndata,M,x,x,cm,+1,0,N,s,s,Fm,&opts);
+  ier = FINUFFT2D3MANY(ndata, M, x, x, cm, +1, 0, N, s, s, Fm, &opts);
   if (ier != FINUFFT_WARN_EPS_TOO_SMALL) {
-    printf("2d3many tol=0:\twrong err code %d\n",ier);
+    printf("2d3many tol=0:\twrong err code %d\n", ier);
     return 1;
   }
-  ier = FINUFFT2D3MANY(ndata,M,x,x,cm,+1,acc,0,s,s,Fm,&opts);
+  ier = FINUFFT2D3MANY(ndata, M, x, x, cm, +1, acc, 0, s, s, Fm, &opts);
   if (ier) {
-    printf("2d3many nk=0:\tier=%d\n",ier);
+    printf("2d3many nk=0:\tier=%d\n", ier);
     return ier;
   }
-  ier = FINUFFT2D3MANY(ndata,0,x,x,cm,+1,acc,N,s,s,Fm,&opts);
-  t = twonorm(N,Fm);
-  if (ier || t!=0.0) {
-    printf("2d3many M=0:\tier=%d nrm(F)=%.3g\n",ier,t);
+  ier = FINUFFT2D3MANY(ndata, 0, x, x, cm, +1, acc, N, s, s, Fm, &opts);
+  t   = twonorm(N, Fm);
+  if (ier || t != 0.0) {
+    printf("2d3many M=0:\tier=%d nrm(F)=%.3g\n", ier, t);
     return 1;
   }
-  ier = FINUFFT2D3MANY(ndata,1,x,x,cm,+1,acc,N,s,s,Fm,&opts); // XK prod formally 0
+  ier = FINUFFT2D3MANY(ndata, 1, x, x, cm, +1, acc, N, s, s, Fm, &opts); // XK prod
+                                                                         // formally 0
   // we don't check the M=nk=1 case for >1D since guess that 1D would catch it.
   if (ier) {
-    printf("2d3many M=nk=1:\tier=%d\n",ier);
+    printf("2d3many M=nk=1:\tier=%d\n", ier);
     return ier;
   }
-  ier = FINUFFT2D3MANY(ndata,M,x,x,cm,+1,acc,N,shuge,shuge,Fm,&opts);
-  if (ier==0) {          // any nonzero code accepted here
-    printf("2d3many XK prod too big:\twrong error code %d\n",ier);
+  ier = FINUFFT2D3MANY(ndata, M, x, x, cm, +1, acc, N, shuge, shuge, Fm, &opts);
+  if (ier == 0) { // any nonzero code accepted here
+    printf("2d3many XK prod too big:\twrong error code %d\n", ier);
     return 1;
   }
-  
+
   // 3333333333333333333333333333333333333333333333333333333333333333333333333
-  printf("3D dumb cases.\n");    // z=y=x, and u=t=s in type 3
-  ier = FINUFFT3D1(M,x,x,x,c,+1,0,N,N,N,F,&opts);
+  printf("3D dumb cases.\n"); // z=y=x, and u=t=s in type 3
+  ier = FINUFFT3D1(M, x, x, x, c, +1, 0, N, N, N, F, &opts);
   if (ier != FINUFFT_WARN_EPS_TOO_SMALL) {
-    printf("3d1 tol=0:\twrong err code %d\n",ier);
+    printf("3d1 tol=0:\twrong err code %d\n", ier);
     return 1;
   }
-  ier = FINUFFT3D1(M,x,x,x,c,+1,acc,0,0,0,F,&opts);
+  ier = FINUFFT3D1(M, x, x, x, c, +1, acc, 0, 0, 0, F, &opts);
   if (ier) {
-    printf("3d1 Ns=Nt=Nu=0:\tier=%d\n",ier);
+    printf("3d1 Ns=Nt=Nu=0:\tier=%d\n", ier);
     return ier;
   }
-  ier = FINUFFT3D1(M,x,x,x,c,+1,acc,0,N,0,F,&opts);
+  ier = FINUFFT3D1(M, x, x, x, c, +1, acc, 0, N, 0, F, &opts);
   if (ier) {
-    printf("3d1 Ns=0,Nt>0,Nu=0:\tier=%d\n",ier);
+    printf("3d1 Ns=0,Nt>0,Nu=0:\tier=%d\n", ier);
     return ier;
   }
-  ier = FINUFFT3D1(M,x,x,x,c,+1,acc,N,0,N,F,&opts);
+  ier = FINUFFT3D1(M, x, x, x, c, +1, acc, N, 0, N, F, &opts);
   if (ier) {
-    printf("3d1 Ns>0,Nt=0,Nu>0:\tier=%d\n",ier);
+    printf("3d1 Ns>0,Nt=0,Nu>0:\tier=%d\n", ier);
     return ier;
   }
-  ier = FINUFFT3D1(0,x,x,x,c,+1,acc,N,N,N,F,&opts);
-  t = twonorm(N,F);
-  if (ier || t!=0.0) {
-    printf("3d1 M=0:\tier=%d nrm(F)=%.3g\n",ier,t);
+  ier = FINUFFT3D1(0, x, x, x, c, +1, acc, N, N, N, F, &opts);
+  t   = twonorm(N, F);
+  if (ier || t != 0.0) {
+    printf("3d1 M=0:\tier=%d nrm(F)=%.3g\n", ier, t);
     return 1;
   }
-  for (int k=0; k<NN; ++k) F[k] = sin((FLT)0.8*k) - IMA*cos((FLT)0.3*k);  // set F for t2
-  ier = FINUFFT3D2(M,x,x,x,c,+1,0,N,N,N,F,&opts);
+  for (int k = 0; k < NN; ++k)
+    F[k] = sin((FLT)0.8 * k) - IMA * cos((FLT)0.3 * k); // set F for t2
+  ier = FINUFFT3D2(M, x, x, x, c, +1, 0, N, N, N, F, &opts);
   if (ier != FINUFFT_WARN_EPS_TOO_SMALL) {
-    printf("3d2 tol=0:\twrong err code %d\n",ier);
+    printf("3d2 tol=0:\twrong err code %d\n", ier);
     return 1;
   }
-  ier = FINUFFT3D2(M,x,x,x,c,+1,acc,0,0,0,F,&opts);
-  t = twonorm(M,c);
-  if (ier || t!=0.0) {
-    printf("3d2 Ns=Nt=Nu=0:\tier=%d nrm(c)=%.3g\n",ier,t);
+  ier = FINUFFT3D2(M, x, x, x, c, +1, acc, 0, 0, 0, F, &opts);
+  t   = twonorm(M, c);
+  if (ier || t != 0.0) {
+    printf("3d2 Ns=Nt=Nu=0:\tier=%d nrm(c)=%.3g\n", ier, t);
     return 1;
   }
-  ier = FINUFFT3D2(M,x,x,x,c,+1,acc,N,0,0,F,&opts);
-  t = twonorm(M,c);
-  if (ier || t!=0.0) {
-    printf("3d2 Ns>0,Nt=Nu=0:\tier=%d nrm(c)=%.3g\n",ier,t);
+  ier = FINUFFT3D2(M, x, x, x, c, +1, acc, N, 0, 0, F, &opts);
+  t   = twonorm(M, c);
+  if (ier || t != 0.0) {
+    printf("3d2 Ns>0,Nt=Nu=0:\tier=%d nrm(c)=%.3g\n", ier, t);
     return 1;
   }
-  ier = FINUFFT3D2(M,x,x,x,c,+1,acc,0,N,0,F,&opts);
-  t = twonorm(M,c);
-  if (ier || t!=0.0) {
-    printf("3d2 Ns=0,Nt>0,Nu=0:\tier=%d nrm(c)=%.3g\n",ier,t);
+  ier = FINUFFT3D2(M, x, x, x, c, +1, acc, 0, N, 0, F, &opts);
+  t   = twonorm(M, c);
+  if (ier || t != 0.0) {
+    printf("3d2 Ns=0,Nt>0,Nu=0:\tier=%d nrm(c)=%.3g\n", ier, t);
     return 1;
   }
-  ier = FINUFFT3D2(M,x,x,x,c,+1,acc,0,0,N,F,&opts);
-  t = twonorm(M,c);
-  if (ier || t!=0.0) {
-    printf("3d2 Ns=Nt=0,Nu>0:\tier=%d nrm(c)=%.3g\n",ier,t);
+  ier = FINUFFT3D2(M, x, x, x, c, +1, acc, 0, 0, N, F, &opts);
+  t   = twonorm(M, c);
+  if (ier || t != 0.0) {
+    printf("3d2 Ns=Nt=0,Nu>0:\tier=%d nrm(c)=%.3g\n", ier, t);
     return 1;
   }
-  ier = FINUFFT3D2(0,x,x,x,c,+1,acc,N,N,N,F,&opts);
+  ier = FINUFFT3D2(0, x, x, x, c, +1, acc, N, N, N, F, &opts);
   if (ier) {
-    printf("3d2 M=0:\tier=%d\n",ier);
+    printf("3d2 M=0:\tier=%d\n", ier);
     return ier;
   }
-  for (int j=0; j<M; ++j) c[j] = sin((FLT)1.2*j) - IMA*cos((FLT)0.8*j); // reset c for t3
-  ier = FINUFFT3D3(M,x,x,x,c,+1,0,N,s,s,s,F,&opts);
+  for (int j = 0; j < M; ++j)
+    c[j] = sin((FLT)1.2 * j) - IMA * cos((FLT)0.8 * j); // reset c for t3
+  ier = FINUFFT3D3(M, x, x, x, c, +1, 0, N, s, s, s, F, &opts);
   if (ier != FINUFFT_WARN_EPS_TOO_SMALL) {
-    printf("3d3 tol=0:\twrong err code %d\n",ier);
+    printf("3d3 tol=0:\twrong err code %d\n", ier);
     return 1;
   }
-  ier = FINUFFT3D3(M,x,x,x,c,+1,acc,0,s,s,s,F,&opts);
+  ier = FINUFFT3D3(M, x, x, x, c, +1, acc, 0, s, s, s, F, &opts);
   if (ier) {
-    printf("3d3 nk=0:\tier=%d\n",ier);
+    printf("3d3 nk=0:\tier=%d\n", ier);
     return ier;
   }
-  ier = FINUFFT3D3(0,x,x,x,c,+1,acc,N,s,s,s,F,&opts);
-  t = twonorm(N,F);
-  if (ier || t!=0.0) {
-    printf("3d3 M=0:\tier=%d nrm(F)=%.3g\n",ier,t);
+  ier = FINUFFT3D3(0, x, x, x, c, +1, acc, N, s, s, s, F, &opts);
+  t   = twonorm(N, F);
+  if (ier || t != 0.0) {
+    printf("3d3 M=0:\tier=%d nrm(F)=%.3g\n", ier, t);
     return 1;
   }
-  ier = FINUFFT3D3(1,x,x,x,c,+1,acc,N,s,s,s,F,&opts);   // XK prod formally 0
+  ier = FINUFFT3D3(1, x, x, x, c, +1, acc, N, s, s, s, F, &opts); // XK prod formally 0
   // we don't check the M=nk=1 case for >1D since guess that 1D would catch it.
   if (ier) {
-    printf("3d3 M=nk=1:\tier=%d\n",ier);
+    printf("3d3 M=nk=1:\tier=%d\n", ier);
     return ier;
   }
-  for (int k=0; k<N; ++k) shuge[k] = pow(huge,1./3)*s[k];  // less huge coords
-  ier = FINUFFT3D3(M,x,x,x,c,+1,acc,N,shuge,shuge,shuge,F,&opts);
-  if (ier==0) {          // any nonzero code accepted here
-    printf("3d3 XK prod too big:\twrong error code %d\n",ier);
+  for (int k = 0; k < N; ++k) shuge[k] = pow(huge, 1. / 3) * s[k]; // less huge coords
+  ier = FINUFFT3D3(M, x, x, x, c, +1, acc, N, shuge, shuge, shuge, F, &opts);
+  if (ier == 0) { // any nonzero code accepted here
+    printf("3d3 XK prod too big:\twrong error code %d\n", ier);
     return 1;
   }
-  for (int j=0; j<M*ndata; ++j) cm[j] = sin(-(FLT)1.2*j) + IMA*cos((FLT)1.1*j); // reset cm for 3d1many
-  ier = FINUFFT3D1MANY(0,M,x,x,x,cm,+1,0,N,N,N,Fm,&opts);
+  for (int j = 0; j < M * ndata; ++j)
+    cm[j] = sin(-(FLT)1.2 * j) + IMA * cos((FLT)1.1 * j); // reset cm for 3d1many
+  ier = FINUFFT3D1MANY(0, M, x, x, x, cm, +1, 0, N, N, N, Fm, &opts);
   if (ier != FINUFFT_ERR_NTRANS_NOTVALID) {
-    printf("3d1many ndata=0:\twrong err code %d\n",ier);
+    printf("3d1many ndata=0:\twrong err code %d\n", ier);
     return 1;
   }
-  ier = FINUFFT3D1MANY(ndata,M,x,x,x,cm,+1,0,N,N,N,Fm,&opts);
+  ier = FINUFFT3D1MANY(ndata, M, x, x, x, cm, +1, 0, N, N, N, Fm, &opts);
   if (ier != FINUFFT_WARN_EPS_TOO_SMALL) {
-    printf("3d1many tol=0:\twrong err code %d\n",ier);
+    printf("3d1many tol=0:\twrong err code %d\n", ier);
     return 1;
   }
-  ier = FINUFFT3D1MANY(ndata,M,x,x,x,cm,+1,acc,0,0,0,Fm,&opts);
+  ier = FINUFFT3D1MANY(ndata, M, x, x, x, cm, +1, acc, 0, 0, 0, Fm, &opts);
   if (ier) {
-    printf("3d1many Ns=Nt=Nu=0:\tier=%d\n",ier);
+    printf("3d1many Ns=Nt=Nu=0:\tier=%d\n", ier);
     return ier;
   }
-  ier = FINUFFT3D1MANY(ndata,M,x,x,x,cm,+1,acc,N,0,0,Fm,&opts);
+  ier = FINUFFT3D1MANY(ndata, M, x, x, x, cm, +1, acc, N, 0, 0, Fm, &opts);
   if (ier) {
-    printf("3d1many Ns>0,Nt=Nu=0:\tier=%d\n",ier);
+    printf("3d1many Ns>0,Nt=Nu=0:\tier=%d\n", ier);
     return ier;
   }
-  ier = FINUFFT3D1MANY(ndata,M,x,x,x,cm,+1,acc,0,N,0,Fm,&opts);
+  ier = FINUFFT3D1MANY(ndata, M, x, x, x, cm, +1, acc, 0, N, 0, Fm, &opts);
   if (ier) {
-    printf("3d1many Ns=0,Nt>0,Nu=0:\tier=%d\n",ier);
+    printf("3d1many Ns=0,Nt>0,Nu=0:\tier=%d\n", ier);
     return ier;
   }
-  ier = FINUFFT3D1MANY(ndata,M,x,x,x,cm,+1,acc,0,0,N,Fm,&opts);
+  ier = FINUFFT3D1MANY(ndata, M, x, x, x, cm, +1, acc, 0, 0, N, Fm, &opts);
   if (ier) {
-    printf("3d1many Ns=Nt=0,Nu>0:\tier=%d\n",ier);
+    printf("3d1many Ns=Nt=0,Nu>0:\tier=%d\n", ier);
     return ier;
   }
-  ier = FINUFFT3D1MANY(ndata,0,x,x,x,cm,+1,acc,N,N,N,Fm,&opts);
-  t = twonorm(N*ndata,Fm);
-  if (ier || t!=0.0) {
-    printf("3d1many M=0:\tier=%d nrm(Fm)=%.3g\n",ier,t);
+  ier = FINUFFT3D1MANY(ndata, 0, x, x, x, cm, +1, acc, N, N, N, Fm, &opts);
+  t   = twonorm(N * ndata, Fm);
+  if (ier || t != 0.0) {
+    printf("3d1many M=0:\tier=%d nrm(Fm)=%.3g\n", ier, t);
     return 1;
   }
-  for (int k=0; k<NN*ndata; ++k) Fm[k] = sin((FLT)0.6*k) - IMA*cos((FLT)0.3*k);  // reset Fm for t2
-  ier = FINUFFT3D2MANY(0,M,x,x,x,cm,+1,0,N,N,N,Fm,&opts);
+  for (int k = 0; k < NN * ndata; ++k)
+    Fm[k] = sin((FLT)0.6 * k) - IMA * cos((FLT)0.3 * k); // reset Fm for t2
+  ier = FINUFFT3D2MANY(0, M, x, x, x, cm, +1, 0, N, N, N, Fm, &opts);
   if (ier != FINUFFT_ERR_NTRANS_NOTVALID) {
-    printf("3d2many ndata=0:\twrong err code %d\n",ier);
+    printf("3d2many ndata=0:\twrong err code %d\n", ier);
     return 1;
   }
-  ier = FINUFFT3D2MANY(ndata,M,x,x,x,cm,+1,0,N,N,N,Fm,&opts);
+  ier = FINUFFT3D2MANY(ndata, M, x, x, x, cm, +1, 0, N, N, N, Fm, &opts);
   if (ier != FINUFFT_WARN_EPS_TOO_SMALL) {
-    printf("3d2many tol=0:\twrong err code %d\n",ier);
+    printf("3d2many tol=0:\twrong err code %d\n", ier);
     return 1;
   }
-  ier = FINUFFT3D2MANY(ndata,M,x,x,x,cm,+1,acc,0,0,0,Fm,&opts);
-  t = twonorm(M*ndata,cm);
-  if (ier || t!=0.0) {
-    printf("3d2many Ns=Nt=Nu=0:\tier=%d nrm(cm)=%.3g\n", ier,t);
+  ier = FINUFFT3D2MANY(ndata, M, x, x, x, cm, +1, acc, 0, 0, 0, Fm, &opts);
+  t   = twonorm(M * ndata, cm);
+  if (ier || t != 0.0) {
+    printf("3d2many Ns=Nt=Nu=0:\tier=%d nrm(cm)=%.3g\n", ier, t);
     return 1;
   }
-  ier = FINUFFT3D2MANY(ndata,M,x,x,x,cm,+1,acc,N,0,0,Fm,&opts);
-  t = twonorm(M*ndata,cm);
-  if (ier || t!=0.0) {
-    printf("3d2many Ns>0,Nt=Nu=0:\tier=%d nrm(cm)=%.3g\n", ier,t);
+  ier = FINUFFT3D2MANY(ndata, M, x, x, x, cm, +1, acc, N, 0, 0, Fm, &opts);
+  t   = twonorm(M * ndata, cm);
+  if (ier || t != 0.0) {
+    printf("3d2many Ns>0,Nt=Nu=0:\tier=%d nrm(cm)=%.3g\n", ier, t);
     return 1;
   }
-  ier = FINUFFT3D2MANY(ndata,M,x,x,x,cm,+1,acc,0,N,0,Fm,&opts);
-  t = twonorm(M*ndata,cm);
-  if (ier || t!=0.0) {
-    printf("3d2many Ns=0,Nt>0,Nu=0:\tier=%d nrm(cm)=%.3g\n", ier,t);
+  ier = FINUFFT3D2MANY(ndata, M, x, x, x, cm, +1, acc, 0, N, 0, Fm, &opts);
+  t   = twonorm(M * ndata, cm);
+  if (ier || t != 0.0) {
+    printf("3d2many Ns=0,Nt>0,Nu=0:\tier=%d nrm(cm)=%.3g\n", ier, t);
     return 1;
   }
-  ier = FINUFFT3D2MANY(ndata,M,x,x,x,cm,+1,acc,0,0,N,Fm,&opts);
-  t = twonorm(M*ndata,cm);
-  if (ier || t!=0.0) {
-    printf("3d2many Ns=Nt=0,Nu>0:\tier=%d nrm(cm)=%.3g\n", ier,t);
+  ier = FINUFFT3D2MANY(ndata, M, x, x, x, cm, +1, acc, 0, 0, N, Fm, &opts);
+  t   = twonorm(M * ndata, cm);
+  if (ier || t != 0.0) {
+    printf("3d2many Ns=Nt=0,Nu>0:\tier=%d nrm(cm)=%.3g\n", ier, t);
     return 1;
   }
-  ier = FINUFFT3D2MANY(ndata,0,x,x,x,cm,+1,acc,N,N,N,Fm,&opts);
+  ier = FINUFFT3D2MANY(ndata, 0, x, x, x, cm, +1, acc, N, N, N, Fm, &opts);
   if (ier) {
-    printf("3d2many M=0:\tier=%d\n",ier);
+    printf("3d2many M=0:\tier=%d\n", ier);
     return ier;
   }
-  ier = FINUFFT3D3MANY(0,M,x,x,x,cm,+1,0,N,s,s,s,Fm,&opts);
+  ier = FINUFFT3D3MANY(0, M, x, x, x, cm, +1, 0, N, s, s, s, Fm, &opts);
   if (ier != FINUFFT_ERR_NTRANS_NOTVALID) {
-    printf("3d3many ndata=0:\twrong err code %d\n",ier);
+    printf("3d3many ndata=0:\twrong err code %d\n", ier);
     return 1;
   }
-  ier = FINUFFT3D3MANY(ndata,M,x,x,x,cm,+1,0,N,s,s,s,Fm,&opts);
+  ier = FINUFFT3D3MANY(ndata, M, x, x, x, cm, +1, 0, N, s, s, s, Fm, &opts);
   if (ier != FINUFFT_WARN_EPS_TOO_SMALL) {
-    printf("3d3many tol=0:\twrong err code %d\n",ier);
+    printf("3d3many tol=0:\twrong err code %d\n", ier);
     return 1;
   }
-  ier = FINUFFT3D3MANY(ndata,M,x,x,x,cm,+1,acc,0,s,s,s,Fm,&opts);
+  ier = FINUFFT3D3MANY(ndata, M, x, x, x, cm, +1, acc, 0, s, s, s, Fm, &opts);
   if (ier) {
-    printf("3d3many nk=0:\tier=%d\n",ier);
+    printf("3d3many nk=0:\tier=%d\n", ier);
     return ier;
   }
-  ier = FINUFFT3D3MANY(ndata,0,x,x,x,cm,+1,acc,N,s,s,s,Fm,&opts);
-  t = twonorm(N,Fm);
-  if (ier || t!=0.0) {
-    printf("3d3many M=0:\tier=%d nrm(F)=%.3g\n",ier,t);
+  ier = FINUFFT3D3MANY(ndata, 0, x, x, x, cm, +1, acc, N, s, s, s, Fm, &opts);
+  t   = twonorm(N, Fm);
+  if (ier || t != 0.0) {
+    printf("3d3many M=0:\tier=%d nrm(F)=%.3g\n", ier, t);
     return 1;
   }
-  ier = FINUFFT3D3MANY(ndata,1,x,x,x,cm,+1,acc,N,s,s,s,Fm,&opts); // XK prod formally 0
+
+  // XK prod formally 0 for the next call:
+  ier = FINUFFT3D3MANY(ndata, 1, x, x, x, cm, +1, acc, N, s, s, s, Fm, &opts);
   // we don't check the M=nk=1 case for >1D since guess that 1D would catch it.
   if (ier) {
-    printf("3d3many M=nk=1:\tier=%d\n",ier);
+    printf("3d3many M=nk=1:\tier=%d\n", ier);
     return ier;
   }
-  ier = FINUFFT3D3MANY(ndata,M,x,x,x,cm,+1,acc,N,shuge,shuge,shuge,Fm,&opts);
-  if (ier==0) {          // any nonzero code accepted here
-    printf("3d3many XK prod too big:\twrong error code %d\n",ier);
+  ier = FINUFFT3D3MANY(ndata, M, x, x, x, cm, +1, acc, N, shuge, shuge, shuge, Fm, &opts);
+  if (ier == 0) { // any nonzero code accepted here
+    printf("3d3many XK prod too big:\twrong error code %d\n", ier);
     return 1;
   }
-  
-  free(x); free(c); free(F); free(s); free(shuge); free(cm); free(Fm); free(Fe);
-  
+
+  free(x);
+  free(c);
+  free(F);
+  free(s);
+  free(shuge);
+  free(cm);
+  free(Fm);
+  free(Fe);
+
   // GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
   // some dumb tests for guru interface to induce free() crash in destroy...
   FINUFFT_PLAN plan;
-  BIGINT Ns[1] = {0};      // since dim=1, don't have to make length 3
-  FINUFFT_MAKEPLAN(1, 1, Ns, +1, 1, acc, &plan, NULL);  // type 1, now kill it
+  BIGINT Ns[1] = {0}; // since dim=1, don't have to make length 3
+  FINUFFT_MAKEPLAN(1, 1, Ns, +1, 1, acc, &plan, NULL); // type 1, now kill it
   FINUFFT_DESTROY(plan);
-  FINUFFT_MAKEPLAN(3, 1, Ns, +1, 1, acc, &plan, NULL);  // type 3, now kill it
+  FINUFFT_MAKEPLAN(3, 1, Ns, +1, 1, acc, &plan, NULL); // type 3, now kill it
   FINUFFT_DESTROY(plan);
   // *** todo: more extensive bad inputs and error catching in guru...
-  
+
 #ifdef SINGLE
   printf("dumbinputsf passed.\n");
 #else
   printf("dumbinputs passed.\n");
 #endif
-  
+
   return 0;
 }
diff --git a/test/finufft1d_test.cpp b/test/finufft1d_test.cpp
index 8dd345b1a..29c0c540f 100644
--- a/test/finufft1d_test.cpp
+++ b/test/finufft1d_test.cpp
@@ -4,120 +4,128 @@
 using namespace std;
 using namespace finufft::utils;
 
-const char* help[]={
-  "Tester for FINUFFT in 1d, all 3 types, either precision.",
-  "",
-  "Usage: finufft1d_test Nmodes Nsrc [tol [debug [spread_sort [upsampfac [errfail]]]]]",
-  "\teg:\tfinufft1d_test 1e6 1e6 1e-6 1 2 2.0 1e-5",
-  "\tnotes:\tif errfail present, exit code 1 if any error > errfail",
-  NULL};
+const char *help[] = {
+    "Tester for FINUFFT in 1d, all 3 types, either precision.",
+    "",
+    "Usage: finufft1d_test Nmodes Nsrc [tol [debug [spread_sort [upsampfac [errfail]]]]]",
+    "\teg:\tfinufft1d_test 1e6 1e6 1e-6 1 2 2.0 1e-5",
+    "\tnotes:\tif errfail present, exit code 1 if any error > errfail",
+    NULL};
 // Barnett 1/22/17 onwards
 
-int main(int argc, char* argv[])
-{
-  BIGINT M, N;   // M = # srcs, N = # modes out
-  double w, tol = 1e-6;         // default
+int main(int argc, char *argv[]) {
+  BIGINT M, N;                // M = # srcs, N = # modes out
+  double w, tol       = 1e-6; // default
   double err, errfail = INFINITY, errmax = 0;
-  finufft_opts opts; FINUFFT_DEFAULT_OPTS(&opts);  // put defaults in opts
+  finufft_opts opts;
+  FINUFFT_DEFAULT_OPTS(&opts); // put defaults in opts
   // opts.fftw = FFTW_MEASURE;  // change from usual FFTW_ESTIMATE
-  int isign = +1;            // choose which exponential sign to test
-  if (argc<3 || argc>8) {
-    for (int i=0; help[i]; ++i)
-      fprintf(stderr,"%s\n",help[i]);
+  int isign = +1; // choose which exponential sign to test
+  if (argc < 3 || argc > 8) {
+    for (int i = 0; help[i]; ++i) fprintf(stderr, "%s\n", help[i]);
     return 2;
   }
-  sscanf(argv[1],"%lf",&w); N = (BIGINT)w;
-  sscanf(argv[2],"%lf",&w); M = (BIGINT)w;
-  if (argc>3) sscanf(argv[3],"%lf",&tol);
-  if (argc>4) sscanf(argv[4],"%d",&opts.debug);
-  opts.spread_debug = (opts.debug>1) ? 1 : 0;  // see output from spreader
-  if (argc>5) sscanf(argv[5],"%d",&opts.spread_sort);
-  if (argc>6) { sscanf(argv[6],"%lf",&w); opts.upsampfac=(FLT)w; }
-  if (argc>7) sscanf(argv[7],"%lf",&errfail);
-  
+  sscanf(argv[1], "%lf", &w);
+  N = (BIGINT)w;
+  sscanf(argv[2], "%lf", &w);
+  M = (BIGINT)w;
+  if (argc > 3) sscanf(argv[3], "%lf", &tol);
+  if (argc > 4) sscanf(argv[4], "%d", &opts.debug);
+  opts.spread_debug = (opts.debug > 1) ? 1 : 0; // see output from spreader
+  if (argc > 5) sscanf(argv[5], "%d", &opts.spread_sort);
+  if (argc > 6) {
+    sscanf(argv[6], "%lf", &w);
+    opts.upsampfac = (FLT)w;
+  }
+  if (argc > 7) sscanf(argv[7], "%lf", &errfail);
+
   cout << scientific << setprecision(15);
 
-  FLT *x = (FLT*)malloc(sizeof(FLT)*M);        // NU pts
-  CPX* c = (CPX*)malloc(sizeof(CPX)*M);   // strengths 
-  CPX* F = (CPX*)malloc(sizeof(CPX)*N);   // mode ampls
+  FLT *x = (FLT *)malloc(sizeof(FLT) * M); // NU pts
+  CPX *c = (CPX *)malloc(sizeof(CPX) * M); // strengths
+  CPX *F = (CPX *)malloc(sizeof(CPX) * N); // mode ampls
 #pragma omp parallel
   {
-    unsigned int se=MY_OMP_GET_THREAD_NUM();  // needed for parallel random #s
-#pragma omp for schedule(static,TEST_RANDCHUNK)   // static => non-stochastic
-    for (BIGINT j=0; j<M; ++j) {
-      x[j] = PI*randm11r(&se);   // fills [-pi,pi)
+    unsigned int se = MY_OMP_GET_THREAD_NUM();   // needed for parallel random #s
+#pragma omp for schedule(static, TEST_RANDCHUNK) // static => non-stochastic
+    for (BIGINT j = 0; j < M; ++j) {
+      x[j] = PI * randm11r(&se);                 // fills [-pi,pi)
       c[j] = crandm11r(&se);
     }
   }
-  //for (BIGINT j=0; j<M; ++j) x[j] = 0.999 * PI*randm11();  // avoid ends
-  //for (BIGINT j=0; j<M; ++j) x[j] = PI*(2*j/(FLT)M-1);  // test a grid
+  // for (BIGINT j=0; j<M; ++j) x[j] = 0.999 * PI*randm11();  // avoid ends
+  // for (BIGINT j=0; j<M; ++j) x[j] = PI*(2*j/(FLT)M-1);  // test a grid
 
   printf("test 1d type 1:\n"); // -------------- type 1
-  CNTime timer; timer.start();
-  int ier = FINUFFT1D1(M,x,c,isign,tol,N,F,&opts);
-  //for (int j=0;j<N;++j) cout<<F[j]<<endl;
-  double t=timer.elapsedsec();
-  if (ier>1) {
-    printf("error (ier=%d)!\n",ier);
+  CNTime timer;
+  timer.start();
+  int ier = FINUFFT1D1(M, x, c, isign, tol, N, F, &opts);
+  // for (int j=0;j<N;++j) cout<<F[j]<<endl;
+  double t = timer.elapsedsec();
+  if (ier > 1) {
+    printf("error (ier=%d)!\n", ier);
     return ier;
   } else
-    printf("\t%lld NU pts to %lld modes in %.3g s \t%.3g NU pts/s\n",(long long)M,(long long)N,t,M/t);
+    printf("\t%lld NU pts to %lld modes in %.3g s \t%.3g NU pts/s\n", (long long)M,
+           (long long)N, t, M / t);
 
-  BIGINT nt = (BIGINT)(0.37*N);   // check arb choice of mode near the top (N/2)
-//#pragma omp declare reduction (cmplxadd:CPX:omp_out=omp_out+omp_in) initializer(omp_priv={0.0,0.0})  // only for openmp v 4.0!
-  //#pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(cmplxadd:Ft)
+  BIGINT nt = (BIGINT)(0.37 * N); // check arb choice of mode near the top (N/2)
+  // #pragma omp declare reduction (cmplxadd:CPX:omp_out=omp_out+omp_in)
+  //   initializer(omp_priv={0.0,0.0})  // only for openmp v 4.0!
+  // #pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(cmplxadd:Ft)
   FLT Ftr = 0.0, Fti = 0.0;
-#pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(+:Ftr,Fti)
-  for (BIGINT j=0; j<M; ++j) {  // Ft += c[j] * exp(IMA*((FLT)(isign*nt))*x[j])
-    FLT co = cos(((FLT)(isign*nt))*x[j]), si = sin(((FLT)(isign*nt))*x[j]);
-    Ftr += real(c[j])*co - imag(c[j])*si;  // cpx arith by hand
-    Fti += imag(c[j])*co + real(c[j])*si;
+#pragma omp parallel for schedule(static, TEST_RANDCHUNK) reduction(+ : Ftr, Fti)
+  for (BIGINT j = 0; j < M; ++j) { // Ft += c[j] * exp(IMA*((FLT)(isign*nt))*x[j])
+    FLT co = cos(((FLT)(isign * nt)) * x[j]), si = sin(((FLT)(isign * nt)) * x[j]);
+    Ftr += real(c[j]) * co - imag(c[j]) * si; // cpx arith by hand
+    Fti += imag(c[j]) * co + real(c[j]) * si;
   }
-  err = abs(Ftr+IMA*Fti - F[N/2+nt])/infnorm(N,F);
-  printf("\tone mode: rel err in F[%lld] is %.3g\n",(long long)nt,err);
-  errmax = max(err,errmax);
-  if (((int64_t)M)*N<=TEST_BIGPROB) {                  // also full direct eval
-    CPX* Ft = (CPX*)malloc(sizeof(CPX)*N);
-    dirft1d1(M,x,c,isign,N,Ft);
-    err = relerrtwonorm(N,Ft,F);
-    errmax = max(err,errmax);
-    printf("\tdirft1d: rel l2-err of result F is %.3g\n",err);
+  err = abs(Ftr + IMA * Fti - F[N / 2 + nt]) / infnorm(N, F);
+  printf("\tone mode: rel err in F[%lld] is %.3g\n", (long long)nt, err);
+  errmax = max(err, errmax);
+  if (((int64_t)M) * N <= TEST_BIGPROB) { // also full direct eval
+    CPX *Ft = (CPX *)malloc(sizeof(CPX) * N);
+    dirft1d1(M, x, c, isign, N, Ft);
+    err    = relerrtwonorm(N, Ft, F);
+    errmax = max(err, errmax);
+    printf("\tdirft1d: rel l2-err of result F is %.3g\n", err);
     free(Ft);
   }
 
   printf("test 1d type 2:\n"); // -------------- type 2
- #pragma omp parallel
+#pragma omp parallel
   {
-    unsigned int se=MY_OMP_GET_THREAD_NUM();  // needed for parallel random #s
-#pragma omp for schedule(static,TEST_RANDCHUNK)
-    for (BIGINT m=0; m<N; ++m) F[m] = crandm11r(&se);
+    unsigned int se = MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s
+#pragma omp for schedule(static, TEST_RANDCHUNK)
+    for (BIGINT m = 0; m < N; ++m) F[m] = crandm11r(&se);
   }
   timer.restart();
-  ier = FINUFFT1D2(M,x,c,isign,tol,N,F,&opts);
-  //cout<<"c:\n"; for (int j=0;j<M;++j) cout<<c[j]<<endl;
-  t=timer.elapsedsec();
-  if (ier>1) {
-    printf("error (ier=%d)!\n",ier);
+  ier = FINUFFT1D2(M, x, c, isign, tol, N, F, &opts);
+  // cout<<"c:\n"; for (int j=0;j<M;++j) cout<<c[j]<<endl;
+  t = timer.elapsedsec();
+  if (ier > 1) {
+    printf("error (ier=%d)!\n", ier);
     return ier;
   } else
-    printf("\t%lld modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n",(long long)N,(long long)M,t,M/t);
+    printf("\t%lld modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", (long long)N,
+           (long long)M, t, M / t);
 
-  BIGINT jt = M/2;          // check arbitrary choice of one targ pt
-  CPX ct = CPX(0,0);
-  BIGINT m=0, k0 = N/2;          // index shift in fk's = mag of most neg freq
-  //#pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(cmplxadd:ct)
-  for (BIGINT m1=-k0; m1<=(N-1)/2; ++m1)
-    ct += F[m++] * exp(IMA*((FLT)(isign*m1))*x[jt]);   // crude direct
-  err = abs(ct-c[jt])/infnorm(M,c);
-  errmax = max(err,errmax);
-  printf("\tone targ: rel err in c[%lld] is %.3g\n",(long long)jt,err);
-  if (((int64_t)M)*N<=TEST_BIGPROB) {                  // also full direct eval
-    CPX* ct = (CPX*)malloc(sizeof(CPX)*M);
-    dirft1d2(M,x,ct,isign,N,F);
-    err = relerrtwonorm(M,ct,c);
-    errmax = max(err,errmax);
-    printf("\tdirft1d: rel l2-err of result c is %.3g\n",err);
-    //cout<<"c/ct:\n"; for (int j=0;j<M;++j) cout<<c[j]/ct[j]<<endl;
+  BIGINT jt = M / 2;        // check arbitrary choice of one targ pt
+  CPX ct    = CPX(0, 0);
+  BIGINT m = 0, k0 = N / 2; // index shift in fk's = mag of most neg freq
+  // #pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(cmplxadd:ct)
+  for (BIGINT m1 = -k0; m1 <= (N - 1) / 2; ++m1)
+    ct += F[m++] * exp(IMA * ((FLT)(isign * m1)) * x[jt]); // crude direct
+  err    = abs(ct - c[jt]) / infnorm(M, c);
+  errmax = max(err, errmax);
+  printf("\tone targ: rel err in c[%lld] is %.3g\n", (long long)jt, err);
+  if (((int64_t)M) * N <= TEST_BIGPROB) { // also full direct eval
+    CPX *ct = (CPX *)malloc(sizeof(CPX) * M);
+    dirft1d2(M, x, ct, isign, N, F);
+    err    = relerrtwonorm(M, ct, c);
+    errmax = max(err, errmax);
+    printf("\tdirft1d: rel l2-err of result c is %.3g\n", err);
+    // cout<<"c/ct:\n"; for (int j=0;j<M;++j) cout<<c[j]/ct[j]<<endl;
     free(ct);
   }
 
@@ -125,49 +133,55 @@ int main(int argc, char* argv[])
   // reuse the strengths c, interpret N as number of targs:
 #pragma omp parallel
   {
-    unsigned int se=MY_OMP_GET_THREAD_NUM();
-#pragma omp for schedule(static,TEST_RANDCHUNK)
-    for (BIGINT j=0; j<M; ++j) x[j] = 2.0 + PI*randm11r(&se);  // new x_j srcs
+    unsigned int se = MY_OMP_GET_THREAD_NUM();
+#pragma omp for schedule(static, TEST_RANDCHUNK)
+    for (BIGINT j = 0; j < M; ++j) x[j] = 2.0 + PI * randm11r(&se); // new x_j srcs
   }
-  FLT* s = (FLT*)malloc(sizeof(FLT)*N);    // targ freqs
-  FLT S = (FLT)N/2;                   // choose freq range sim to type 1
+  FLT *s = (FLT *)malloc(sizeof(FLT) * N);                          // targ freqs
+  FLT S  = (FLT)N / 2; // choose freq range sim to type 1
 #pragma omp parallel
   {
-    unsigned int se=MY_OMP_GET_THREAD_NUM();
-#pragma omp for schedule(static,TEST_RANDCHUNK)
-    for (BIGINT k=0; k<N; ++k) s[k] = S*(1.7 + randm11r(&se)); //S*(1.7 + k/(FLT)N); // offset
+    unsigned int se = MY_OMP_GET_THREAD_NUM();
+#pragma omp for schedule(static, TEST_RANDCHUNK)
+    for (BIGINT k = 0; k < N; ++k)
+      s[k] = S * (1.7 + randm11r(&se)); // S*(1.7 + k/(FLT)N); // offset
   }
   timer.restart();
-  ier = FINUFFT1D3(M,x,c,isign,tol,N,s,F,&opts);
-  t=timer.elapsedsec();
-  if (ier>0) {
-    printf("error (ier=%d)!\n",ier);
+  ier = FINUFFT1D3(M, x, c, isign, tol, N, s, F, &opts);
+  t   = timer.elapsedsec();
+  if (ier > 0) {
+    printf("error (ier=%d)!\n", ier);
     return ier;
   } else
-    printf("\t%lld NU to %lld NU in %.3g s        \t%.3g tot NU pts/s\n",(long long)M,(long long)N,t,(M+N)/t);
+    printf("\t%lld NU to %lld NU in %.3g s        \t%.3g tot NU pts/s\n", (long long)M,
+           (long long)N, t, (M + N) / t);
 
-  BIGINT kt = N/2;          // check arbitrary choice of one targ pt
-  Ftr = 0.0;
-  Fti = 0.0;
-#pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(+:Ftr,Fti)
-  for (BIGINT j=0; j<M; ++j) {  // Ft += c[j] * exp(IMA*(FLT)isign*s[kt]*x[j])
-    FLT co = cos((FLT)isign*s[kt]*x[j]), si = sin((FLT)isign*s[kt]*x[j]);
-    Ftr += real(c[j])*co - imag(c[j])*si;  // cpx arith by hand
-    Fti += imag(c[j])*co + real(c[j])*si;
+  BIGINT kt = N / 2; // check arbitrary choice of one targ pt
+  Ftr       = 0.0;
+  Fti       = 0.0;
+#pragma omp parallel for schedule(static, TEST_RANDCHUNK) reduction(+ : Ftr, Fti)
+  for (BIGINT j = 0; j < M; ++j) { // Ft += c[j] * exp(IMA*(FLT)isign*s[kt]*x[j])
+    FLT co = cos((FLT)isign * s[kt] * x[j]), si = sin((FLT)isign * s[kt] * x[j]);
+    Ftr += real(c[j]) * co - imag(c[j]) * si; // cpx arith by hand
+    Fti += imag(c[j]) * co + real(c[j]) * si;
   }
-  err = abs(Ftr+IMA*Fti-F[kt])/infnorm(N,F);
-  errmax = max(err,errmax);
-  printf("\tone targ: rel err in F[%lld] is %.3g\n",(long long)kt,err);
-  if (((int64_t)M)*N<=TEST_BIGPROB) {                  // also full direct eval
-    CPX* Ft = (CPX*)malloc(sizeof(CPX)*N);
-    dirft1d3(M,x,c,isign,N,s,Ft);       // writes to F
-    err = relerrtwonorm(N,Ft,F);
-    errmax = max(err,errmax);
-    printf("\tdirft1d: rel l2-err of result F is %.3g\n",err);
-    //cout<<"s, F, Ft:\n"; for (int k=0;k<N;++k) cout<<s[k]<<" "<<F[k]<<"\t"<<Ft[k]<<"\t"<<F[k]/Ft[k]<<endl;
+  err    = abs(Ftr + IMA * Fti - F[kt]) / infnorm(N, F);
+  errmax = max(err, errmax);
+  printf("\tone targ: rel err in F[%lld] is %.3g\n", (long long)kt, err);
+  if (((int64_t)M) * N <= TEST_BIGPROB) { // also full direct eval
+    CPX *Ft = (CPX *)malloc(sizeof(CPX) * N);
+    dirft1d3(M, x, c, isign, N, s, Ft);   // writes to Ft
+    err    = relerrtwonorm(N, Ft, F);
+    errmax = max(err, errmax);
+    printf("\tdirft1d: rel l2-err of result F is %.3g\n", err);
+    // cout<<"s, F, Ft:\n";
+    // for (int k=0;k<N;++k) cout<<s[k]<<" "<<F[k]<<"\t"<<Ft[k]<<"\t"<<F[k]/Ft[k]<<endl;
     free(Ft);
   }
 
-  free(x); free(c); free(F); free(s);
-  return (errmax>errfail);
+  free(x);
+  free(c);
+  free(F);
+  free(s);
+  return (errmax > errfail);
 }
diff --git a/test/finufft1dmany_test.cpp b/test/finufft1dmany_test.cpp
index 581c52c2d..e17cbb65e 100644
--- a/test/finufft1dmany_test.cpp
+++ b/test/finufft1dmany_test.cpp
@@ -4,164 +4,172 @@
 using namespace std;
 using namespace finufft::utils;
 
-const char* help[]={
-  "Tester for FINUFFT in 1d, vectorized, all 3 types, either precision.",
-  "",
-  "Usage: finufft1dmany_test ntrans Nmodes Nsrc [tol [debug [spread_thread [maxbatchsize [spreadsort [upsampfac [errfail]]]]]]]",
-  "\teg:\tfinufft1dmany_test 100 1e3 1e4 1e-6 1 0 0 2 0.0 1e-5",
-  "\tnotes:\tif errfail present, exit code 1 if any error > errfail",
-  NULL};
+const char *help[] = {
+    "Tester for FINUFFT in 1d, vectorized, all 3 types, either precision.",
+    "",
+    "Usage: finufft1dmany_test ntrans Nmodes Nsrc [tol [debug [spread_thread "
+    "[maxbatchsize [spreadsort [upsampfac [errfail]]]]]]]",
+    "\teg:\tfinufft1dmany_test 100 1e3 1e4 1e-6 1 0 0 2 0.0 1e-5",
+    "\tnotes:\tif errfail present, exit code 1 if any error > errfail",
+    NULL};
 // Malleo 2019 based on Shih 2018. Tidied, extra args, Barnett 5/25/20 onwards
 
-int main(int argc, char* argv[])
-{   
-  BIGINT M, N;                   // M = # srcs, N = # modes
-  int ntransf;                   // # of vectors for "many" interface
-  double w, tol = 1e-6;          // default
+int main(int argc, char *argv[]) {
+  BIGINT M, N;                // M = # srcs, N = # modes
+  int ntransf;                // # of vectors for "many" interface
+  double w, tol       = 1e-6; // default
   double err, errfail = INFINITY, errmax = 0;
-  finufft_opts opts; FINUFFT_DEFAULT_OPTS(&opts);
+  finufft_opts opts;
+  FINUFFT_DEFAULT_OPTS(&opts);
   // opts.fftw = FFTW_MEASURE;  // change from usual FFTW_ESTIMATE
-  int isign = +1;             // choose which exponential sign to test
-  if (argc<4 || argc>11) {
-    for (int i=0; help[i]; ++i)
-      fprintf(stderr,"%s\n",help[i]);
+  int isign = +1; // choose which exponential sign to test
+  if (argc < 4 || argc > 11) {
+    for (int i = 0; help[i]; ++i) fprintf(stderr, "%s\n", help[i]);
     return 2;
   }
-  sscanf(argv[1],"%lf",&w); ntransf = (int)w;
-  sscanf(argv[2],"%lf",&w); N = (BIGINT)w;
-  sscanf(argv[3],"%lf",&w); M = (BIGINT)w;
-  if (argc>4) sscanf(argv[4],"%lf",&tol);
-  if (argc>5) sscanf(argv[5],"%d",&opts.debug);
-  opts.spread_debug = (opts.debug>1) ? 1 : 0;  // see output from spreader
-  if (argc>6) sscanf(argv[6],"%d",&opts.spread_thread);  
-  if (argc>7) sscanf(argv[7],"%d",&opts.maxbatchsize);    
-  if (argc>8) sscanf(argv[8],"%d",&opts.spread_sort);
-  if (argc>9) { sscanf(argv[9],"%lf",&w); opts.upsampfac=(FLT)w; }
-  if (argc>10) sscanf(argv[10],"%lf",&errfail);
+  sscanf(argv[1], "%lf", &w);
+  ntransf = (int)w;
+  sscanf(argv[2], "%lf", &w);
+  N = (BIGINT)w;
+  sscanf(argv[3], "%lf", &w);
+  M = (BIGINT)w;
+  if (argc > 4) sscanf(argv[4], "%lf", &tol);
+  if (argc > 5) sscanf(argv[5], "%d", &opts.debug);
+  opts.spread_debug = (opts.debug > 1) ? 1 : 0; // see output from spreader
+  if (argc > 6) sscanf(argv[6], "%d", &opts.spread_thread);
+  if (argc > 7) sscanf(argv[7], "%d", &opts.maxbatchsize);
+  if (argc > 8) sscanf(argv[8], "%d", &opts.spread_sort);
+  if (argc > 9) {
+    sscanf(argv[9], "%lf", &w);
+    opts.upsampfac = (FLT)w;
+  }
+  if (argc > 10) sscanf(argv[10], "%lf", &errfail);
 
   cout << scientific << setprecision(15);
- 
-  FLT* x = (FLT*)malloc(sizeof(FLT)*M);  // NU pts x coords
-  CPX* c = (CPX*)malloc(sizeof(CPX)*M*ntransf);   // strengths 
-  CPX* F = (CPX*)malloc(sizeof(CPX)*N*ntransf);   // mode ampls
+
+  FLT *x = (FLT *)malloc(sizeof(FLT) * M);           // NU pts x coords
+  CPX *c = (CPX *)malloc(sizeof(CPX) * M * ntransf); // strengths
+  CPX *F = (CPX *)malloc(sizeof(CPX) * N * ntransf); // mode ampls
 
 #pragma omp parallel
   {
-    unsigned int se=MY_OMP_GET_THREAD_NUM();
-#pragma omp for schedule(static,TEST_RANDCHUNK)
-    for (BIGINT j=0; j<M; ++j) {
-      x[j] = M_PI*randm11r(&se);
+    unsigned int se = MY_OMP_GET_THREAD_NUM();
+#pragma omp for schedule(static, TEST_RANDCHUNK)
+    for (BIGINT j = 0; j < M; ++j) {
+      x[j] = M_PI * randm11r(&se);
     }
-#pragma omp for schedule(static,TEST_RANDCHUNK)
-    for (BIGINT j = 0; j<ntransf*M; ++j)
-    {
-        c[j] = crandm11r(&se);
+#pragma omp for schedule(static, TEST_RANDCHUNK)
+    for (BIGINT j = 0; j < ntransf * M; ++j) {
+      c[j] = crandm11r(&se);
     }
   }
 
   printf("test 1d1 many vs repeated single: ------------------------------------\n");
-  CNTime timer; timer.start();
-  int ier = FINUFFT1D1MANY(ntransf,M,x,c,isign,tol,N,F,&opts);
-  double ti=timer.elapsedsec();
-  if (ier>1) {
-    printf("error (ier=%d)!\n",ier);
+  CNTime timer;
+  timer.start();
+  int ier   = FINUFFT1D1MANY(ntransf, M, x, c, isign, tol, N, F, &opts);
+  double ti = timer.elapsedsec();
+  if (ier > 1) {
+    printf("error (ier=%d)!\n", ier);
     return ier;
   } else
-    printf("ntr=%d: %lld NU pts to %lld modes in %.3g s  \t%.3g NU pts/s\n", ntransf,(long long)M,(long long)N,ti,ntransf*M/ti);
-
-  int i = (ntransf-1);    // choose a trial to check
-  BIGINT nt1 = (BIGINT)(0.37*N);  // choose some mode index to check
-  CPX Ft = CPX(0,0), J = IMA*(FLT)isign;
-  for (BIGINT j=0; j<M; ++j)
-    Ft += c[j+i*M] * exp(J*(nt1*x[j]));   // crude direct
-  BIGINT it = N/2+nt1 ; // index in complex F as 1d array
-  err = abs(Ft-F[it+i*N])/infnorm(N,F+i*N);
-  errmax = max(err,errmax);
-  printf("\tone mode: rel err in F[%lld] of trans#%d is %.3g\n",
-	 (long long)nt1,i,err);
+    printf("ntr=%d: %lld NU pts to %lld modes in %.3g s  \t%.3g NU pts/s\n", ntransf,
+           (long long)M, (long long)N, ti, ntransf * M / ti);
+
+  int i      = (ntransf - 1);      // choose a trial to check
+  BIGINT nt1 = (BIGINT)(0.37 * N); // choose some mode index to check
+  CPX Ft = CPX(0, 0), J = IMA * (FLT)isign;
+  for (BIGINT j = 0; j < M; ++j)
+    Ft += c[j + i * M] * exp(J * (nt1 * x[j])); // crude direct
+  BIGINT it = N / 2 + nt1;                      // index in complex F as 1d array
+  err       = abs(Ft - F[it + i * N]) / infnorm(N, F + i * N);
+  errmax    = max(err, errmax);
+  printf("\tone mode: rel err in F[%lld] of trans#%d is %.3g\n", (long long)nt1, i, err);
 
   // compare the result with FINUFFT1D1
   FFTW_FORGET_WISDOM();
-  CPX * F_1d1 = (CPX *)malloc(sizeof(CPX)*N*ntransf);
-  CPX * Fstart;
-  CPX * cstart;
+  CPX *F_1d1 = (CPX *)malloc(sizeof(CPX) * N * ntransf);
+  CPX *Fstart;
+  CPX *cstart;
   timer.restart();
-  finufft_opts simpleopts = opts;    // opts just for simple interface
-  simpleopts.debug = 0;
-  simpleopts.spread_debug = 0; 
-  for(BIGINT j = 0; j < ntransf; j++){
-    Fstart = F_1d1 + j*N;
-    cstart = c + j*M;
-    FINUFFT1D1(M,x,cstart,isign,tol,N,Fstart,&simpleopts);
+  finufft_opts simpleopts = opts; // opts just for simple interface
+  simpleopts.debug        = 0;
+  simpleopts.spread_debug = 0;
+  for (BIGINT j = 0; j < ntransf; j++) {
+    Fstart = F_1d1 + j * N;
+    cstart = c + j * M;
+    FINUFFT1D1(M, x, cstart, isign, tol, N, Fstart, &simpleopts);
   }
   double t = timer.elapsedsec();
-  if (ier>1) {
-    printf("error (ier=%d)!\n",ier);
+  if (ier > 1) {
+    printf("error (ier=%d)!\n", ier);
     return ier;
   } else
-    printf("%d of: %lld NU pts to %lld modes in %.3g s \t%.3g NU pts/s\n", ntransf,(long long)M,(long long)N,t,ntransf*M/t);
-  printf("\t\t\tspeedup \t T_FINUFFT1D1 / T_finufft1d1many = %.3g\n", t/ti);
-  
-   // Check consistency (worst over the ntransf)
+    printf("%d of: %lld NU pts to %lld modes in %.3g s \t%.3g NU pts/s\n", ntransf,
+           (long long)M, (long long)N, t, ntransf * M / t);
+  printf("\t\t\tspeedup \t T_FINUFFT1D1 / T_finufft1d1many = %.3g\n", t / ti);
+
+  // Check consistency (worst over the ntransf)
   double maxerror = 0.0;
   for (int k = 0; k < ntransf; ++k)
-    maxerror = max(maxerror, (double)relerrtwonorm(N,F_1d1+k*N,F+k*N));
-  errmax = max(maxerror,errmax);
-  printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2  ) =  %.3g\n",maxerror);
+    maxerror = max(maxerror, (double)relerrtwonorm(N, F_1d1 + k * N, F + k * N));
+  errmax = max(maxerror, errmax);
+  printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2  ) =  %.3g\n", maxerror);
   free(F_1d1);
 
-
   printf("test 1d2 many vs repeated single: ------------------------------------\n");
   FFTW_FORGET_WISDOM();
 
 #pragma omp parallel
   {
-    unsigned int se=MY_OMP_GET_THREAD_NUM();  // needed for parallel random #s
-#pragma omp for schedule(static,TEST_RANDCHUNK)
-    for (BIGINT m=0; m<N; ++m) F[m] = crandm11r(&se);
+    unsigned int se = MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s
+#pragma omp for schedule(static, TEST_RANDCHUNK)
+    for (BIGINT m = 0; m < N; ++m) F[m] = crandm11r(&se);
   }
   timer.restart();
-  ier = FINUFFT1D2MANY(ntransf, M,x,c,isign,tol,N,F,&opts);
-  //cout<<"c:\n"; for (int j=0;j<M;++j) cout<<c[j]<<endl;
-  ti=timer.elapsedsec();
-  if (ier>1) {
-    printf("error (ier=%d)!\n",ier);
+  ier = FINUFFT1D2MANY(ntransf, M, x, c, isign, tol, N, F, &opts);
+  // cout<<"c:\n"; for (int j=0;j<M;++j) cout<<c[j]<<endl;
+  ti = timer.elapsedsec();
+  if (ier > 1) {
+    printf("error (ier=%d)!\n", ier);
     return ier;
   } else
-    printf("ntr=%d: %lld modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n",ntransf,(long long)N,(long long)M,ti,ntransf*M/ti);
-
-  BIGINT jt = M/2;          // check arbitrary choice of one targ pt
-  CPX ct = CPX(0,0);
-  BIGINT m=0, k0 = N/2;          // index shift in fk's = mag of most neg freq
-  //#pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(cmplxadd:ct)
-  for (BIGINT m1=-k0; m1<=(N-1)/2; ++m1)
-    ct += F[i*N + m++] * exp(IMA*((FLT)(isign*m1))*x[jt]);   // crude direct
-  err = abs(ct-c[jt + i*M])/infnorm(M,c+i*M);
-  errmax = max(err,errmax);
-  printf("\tone targ: rel err in c[%lld] of trans#%d is %.3g\n",(long long)jt,i,err);
+    printf("ntr=%d: %lld modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", ntransf,
+           (long long)N, (long long)M, ti, ntransf * M / ti);
+
+  BIGINT jt = M / 2;        // check arbitrary choice of one targ pt
+  CPX ct    = CPX(0, 0);
+  BIGINT m = 0, k0 = N / 2; // index shift in fk's = mag of most neg freq
+  // #pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(cmplxadd:ct)
+  for (BIGINT m1 = -k0; m1 <= (N - 1) / 2; ++m1)
+    ct += F[i * N + m++] * exp(IMA * ((FLT)(isign * m1)) * x[jt]); // crude direct
+  err    = abs(ct - c[jt + i * M]) / infnorm(M, c + i * M);
+  errmax = max(err, errmax);
+  printf("\tone targ: rel err in c[%lld] of trans#%d is %.3g\n", (long long)jt, i, err);
 
   // check against single calls to FINUFFT1D2...
   FFTW_FORGET_WISDOM();
-  CPX * c_1d2 = (CPX *)malloc(sizeof(CPX)*M*ntransf);
+  CPX *c_1d2 = (CPX *)malloc(sizeof(CPX) * M * ntransf);
   timer.restart();
-  for(BIGINT j = 0; j < ntransf; j++){
-    Fstart = F + j*N;
-    cstart = c_1d2 + j*M;
-    FINUFFT1D2(M,x,cstart,isign,tol,N,Fstart,&simpleopts);
+  for (BIGINT j = 0; j < ntransf; j++) {
+    Fstart = F + j * N;
+    cstart = c_1d2 + j * M;
+    FINUFFT1D2(M, x, cstart, isign, tol, N, Fstart, &simpleopts);
   }
   t = timer.elapsedsec();
-  if (ier>1) {
-    printf("error (ier=%d)!\n",ier);
+  if (ier > 1) {
+    printf("error (ier=%d)!\n", ier);
     return ier;
   } else
-    printf("%d of: %lld modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", ntransf,(long long)N,(long long)M,t,ntransf*M/t);
-  printf("\t\t\tspeedup \t T_FINUFFT1D2 / T_finufft1d2many = %.3g\n", t/ti);
-  
-  maxerror = 0.0;           // worst error over the ntransf
+    printf("%d of: %lld modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", ntransf,
+           (long long)N, (long long)M, t, ntransf * M / t);
+  printf("\t\t\tspeedup \t T_FINUFFT1D2 / T_finufft1d2many = %.3g\n", t / ti);
+
+  maxerror = 0.0; // worst error over the ntransf
   for (int k = 0; k < ntransf; ++k)
-    maxerror = max(maxerror, (double)relerrtwonorm(M,c_1d2+k*M,c+k*M));
-  errmax = max(maxerror,errmax);
-  printf("\tconsistency check: sup ( ||c_many-c||_2 / ||c||_2 ) =  %.3g\n",maxerror);
+    maxerror = max(maxerror, (double)relerrtwonorm(M, c_1d2 + k * M, c + k * M));
+  errmax = max(maxerror, errmax);
+  printf("\tconsistency check: sup ( ||c_many-c||_2 / ||c||_2 ) =  %.3g\n", maxerror);
   free(c_1d2);
 
   printf("test 1d3 many vs repeated single: ------------------------------------\n");
@@ -169,68 +177,69 @@ int main(int argc, char* argv[])
 
 #pragma omp parallel
   {
-    unsigned int se=MY_OMP_GET_THREAD_NUM();
-#pragma omp for schedule(static,TEST_RANDCHUNK)
-    for (BIGINT j=0; j<M; ++j) x[j] = 2.0 + PI*randm11r(&se);  // new x_j srcs
+    unsigned int se = MY_OMP_GET_THREAD_NUM();
+#pragma omp for schedule(static, TEST_RANDCHUNK)
+    for (BIGINT j = 0; j < M; ++j) x[j] = 2.0 + PI * randm11r(&se); // new x_j srcs
   }
-  FLT* s = (FLT*)malloc(sizeof(FLT)*N);    // targ freqs
-  FLT S = (FLT)N/2;                   // choose freq range sim to type 1
+  FLT *s = (FLT *)malloc(sizeof(FLT) * N);                          // targ freqs
+  FLT S  = (FLT)N / 2; // choose freq range sim to type 1
 #pragma omp parallel
   {
-    unsigned int se=MY_OMP_GET_THREAD_NUM();
-#pragma omp for schedule(static,TEST_RANDCHUNK)
-    for (BIGINT k=0; k<N; ++k)
-      s[k] = S*(1.7 + randm11r(&se)); //S*(1.7 + k/(FLT)N); // offset
-  
-#pragma omp for schedule(static,TEST_RANDCHUNK)
-    for (BIGINT j = 0; j<ntransf*M; ++j) 
-        c[j] = crandm11r(&se);
+    unsigned int se = MY_OMP_GET_THREAD_NUM();
+#pragma omp for schedule(static, TEST_RANDCHUNK)
+    for (BIGINT k = 0; k < N; ++k)
+      s[k] = S * (1.7 + randm11r(&se)); // S*(1.7 + k/(FLT)N); // offset
+
+#pragma omp for schedule(static, TEST_RANDCHUNK)
+    for (BIGINT j = 0; j < ntransf * M; ++j) c[j] = crandm11r(&se);
   }
-  
+
   timer.restart();
-  ier = FINUFFT1D3MANY(ntransf, M,x,c,isign,tol,N,s,F,&opts);
-  ti=timer.elapsedsec();
-  if (ier>1) {
-    printf("error (ier=%d)!\n",ier);
+  ier = FINUFFT1D3MANY(ntransf, M, x, c, isign, tol, N, s, F, &opts);
+  ti  = timer.elapsedsec();
+  if (ier > 1) {
+    printf("error (ier=%d)!\n", ier);
     return ier;
   } else
-    printf("ntr=%d: %lld NU to %lld NU in %.3g s       \t%.3g tot NU pts/s\n",ntransf,(long long)M,(long long)N,ti,ntransf*(M+N)/ti);
-  
-  BIGINT kt = N/4;          // check arbitrary choice of one targ pt
-  Ft = CPX(0,0);
-  //#pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(cmplxadd:Ft)
-  for (BIGINT j=0;j<M;++j)
-    Ft += c[j+i*M] * exp(IMA*(FLT)isign*s[kt]*x[j]);
-  err = abs(Ft-F[kt+i*N])/infnorm(N,F+i*N);
-  errmax = max(err,errmax);
-  printf("\tone targ: rel err in F[%lld] of trans#%d is %.3g\n",(long long)kt,i,err);
+    printf("ntr=%d: %lld NU to %lld NU in %.3g s       \t%.3g tot NU pts/s\n", ntransf,
+           (long long)M, (long long)N, ti, ntransf * (M + N) / ti);
+
+  BIGINT kt = N / 4; // check arbitrary choice of one targ pt
+  Ft        = CPX(0, 0);
+  // #pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(cmplxadd:Ft)
+  for (BIGINT j = 0; j < M; ++j)
+    Ft += c[j + i * M] * exp(IMA * (FLT)isign * s[kt] * x[j]);
+  err    = abs(Ft - F[kt + i * N]) / infnorm(N, F + i * N);
+  errmax = max(err, errmax);
+  printf("\tone targ: rel err in F[%lld] of trans#%d is %.3g\n", (long long)kt, i, err);
 
   // compare the result with single calls to FINUFFT1D3...
   FFTW_FORGET_WISDOM();
-  CPX *f_1d3 = (CPX *)malloc(sizeof(CPX)*N*ntransf);
+  CPX *f_1d3 = (CPX *)malloc(sizeof(CPX) * N * ntransf);
   timer.restart();
-  for(int k = 0; k < ntransf; k++){
-    cstart = c + k*M;
-    Fstart = f_1d3 + k*N;
-    ier = FINUFFT1D3(M,x,cstart,isign,tol,N,s,Fstart,&simpleopts);
+  for (int k = 0; k < ntransf; k++) {
+    cstart = c + k * M;
+    Fstart = f_1d3 + k * N;
+    ier    = FINUFFT1D3(M, x, cstart, isign, tol, N, s, Fstart, &simpleopts);
   }
   t = timer.elapsedsec();
-  if (ier>1) {
-    printf("error (ier=%d)!\n",ier);
+  if (ier > 1) {
+    printf("error (ier=%d)!\n", ier);
     return ier;
   } else
-    printf("%d of: %lld NU to %lld NU in %.3g s       \t%.3g tot NU pts/s\n",ntransf, (long long)M,(long long)N,t,ntransf*(M+N)/t);
-  printf("\t\t\tspeedup \t T_FINUFFT1D3 / T_finufft1d3many = %.3g\n", t/ti);
+    printf("%d of: %lld NU to %lld NU in %.3g s       \t%.3g tot NU pts/s\n", ntransf,
+           (long long)M, (long long)N, t, ntransf * (M + N) / t);
+  printf("\t\t\tspeedup \t T_FINUFFT1D3 / T_finufft1d3many = %.3g\n", t / ti);
 
-  maxerror = 0.0;           // worst error over the ntransf
+  maxerror = 0.0; // worst error over the ntransf
   for (int k = 0; k < ntransf; ++k)
-    maxerror = max(maxerror, (double)relerrtwonorm(N,f_1d3+k*N,F+k*N));
-  errmax = max(maxerror,errmax);
-  printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2 ) =  %.3g\n",maxerror);
+    maxerror = max(maxerror, (double)relerrtwonorm(N, f_1d3 + k * N, F + k * N));
+  errmax = max(maxerror, errmax);
+  printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2 ) =  %.3g\n", maxerror);
   free(f_1d3);
   free(x);
   free(s);
   free(c);
   free(F);
-  return (errmax>errfail);
-}  
+  return (errmax > errfail);
+}
diff --git a/test/finufft2d_test.cpp b/test/finufft2d_test.cpp
index 04945b5f9..5c053dc8e 100644
--- a/test/finufft2d_test.cpp
+++ b/test/finufft2d_test.cpp
@@ -4,120 +4,129 @@
 using namespace std;
 using namespace finufft::utils;
 
-const char* help[]={
-  "Tester for FINUFFT in 2d, all 3 types, either precision.",
-  "",
-  "Usage: finufft2d_test Nmodes1 Nmodes2 Nsrc [tol [debug [spread_sort [upsampfac [errfail]]]]]",
-  "\teg:\tfinufft2d_test 1000 1000 1000000 1e-12 1 2 2.0 1e-11",
-  "\tnotes:\tif errfail present, exit code 1 if any error > errfail",
-  NULL};
+const char *help[] = {"Tester for FINUFFT in 2d, all 3 types, either precision.",
+                      "",
+                      "Usage: finufft2d_test Nmodes1 Nmodes2 Nsrc [tol [debug "
+                      "[spread_sort [upsampfac [errfail]]]]]",
+                      "\teg:\tfinufft2d_test 1000 1000 1000000 1e-12 1 2 2.0 1e-11",
+                      "\tnotes:\tif errfail present, exit code 1 if any error > errfail",
+                      NULL};
 // Barnett 2/1/17 onwards
 
-int main(int argc, char* argv[])
-{
-  BIGINT M, N1, N2;              // M = # srcs, N1,N2 = # modes
-  double w, tol = 1e-6;          // default
+int main(int argc, char *argv[]) {
+  BIGINT M, N1, N2;           // M = # srcs, N1,N2 = # modes
+  double w, tol       = 1e-6; // default
   double err, errfail = INFINITY, errmax = 0;
-  finufft_opts opts; FINUFFT_DEFAULT_OPTS(&opts);
+  finufft_opts opts;
+  FINUFFT_DEFAULT_OPTS(&opts);
   // opts.fftw = FFTW_MEASURE;  // change from usual FFTW_ESTIMATE
-  int isign = +1;             // choose which exponential sign to test
-  if (argc<4 || argc>9) {
-    for (int i=0; help[i]; ++i)
-      fprintf(stderr,"%s\n",help[i]);
+  int isign = +1; // choose which exponential sign to test
+  if (argc < 4 || argc > 9) {
+    for (int i = 0; help[i]; ++i) fprintf(stderr, "%s\n", help[i]);
     return 2;
   }
-  sscanf(argv[1],"%lf",&w); N1 = (BIGINT)w;
-  sscanf(argv[2],"%lf",&w); N2 = (BIGINT)w;
-  sscanf(argv[3],"%lf",&w); M = (BIGINT)w;
-  if (argc>4) sscanf(argv[4],"%lf",&tol);
-  if (argc>5) sscanf(argv[5],"%d",&opts.debug);
-  opts.spread_debug = (opts.debug>1) ? 1 : 0;  // see output from spreader
-  if (argc>6) sscanf(argv[6],"%d",&opts.spread_sort);
-  if (argc>7) { sscanf(argv[7],"%lf",&w); opts.upsampfac=(FLT)w; }
-  if (argc>8) sscanf(argv[8],"%lf",&errfail);
-  
+  sscanf(argv[1], "%lf", &w);
+  N1 = (BIGINT)w;
+  sscanf(argv[2], "%lf", &w);
+  N2 = (BIGINT)w;
+  sscanf(argv[3], "%lf", &w);
+  M = (BIGINT)w;
+  if (argc > 4) sscanf(argv[4], "%lf", &tol);
+  if (argc > 5) sscanf(argv[5], "%d", &opts.debug);
+  opts.spread_debug = (opts.debug > 1) ? 1 : 0; // see output from spreader
+  if (argc > 6) sscanf(argv[6], "%d", &opts.spread_sort);
+  if (argc > 7) {
+    sscanf(argv[7], "%lf", &w);
+    opts.upsampfac = (FLT)w;
+  }
+  if (argc > 8) sscanf(argv[8], "%lf", &errfail);
+
   cout << scientific << setprecision(15);
-  BIGINT N = N1*N2;
+  BIGINT N = N1 * N2;
 
-  FLT *x = (FLT *)malloc(sizeof(FLT)*M);        // NU pts x coords
-  FLT *y = (FLT *)malloc(sizeof(FLT)*M);        // NU pts y coords
-  CPX* c = (CPX*)malloc(sizeof(CPX)*M);   // strengths 
-  CPX* F = (CPX*)malloc(sizeof(CPX)*N);   // mode ampls
+  FLT *x = (FLT *)malloc(sizeof(FLT) * M); // NU pts x coords
+  FLT *y = (FLT *)malloc(sizeof(FLT) * M); // NU pts y coords
+  CPX *c = (CPX *)malloc(sizeof(CPX) * M); // strengths
+  CPX *F = (CPX *)malloc(sizeof(CPX) * N); // mode ampls
 #pragma omp parallel
   {
-    unsigned int se=MY_OMP_GET_THREAD_NUM();  // needed for parallel random #s
-#pragma omp for schedule(static,TEST_RANDCHUNK)
-    for (BIGINT j=0; j<M; ++j) {
-      x[j] = M_PI*randm11r(&se);
-      y[j] = M_PI*randm11r(&se);
+    unsigned int se = MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s
+#pragma omp for schedule(static, TEST_RANDCHUNK)
+    for (BIGINT j = 0; j < M; ++j) {
+      x[j] = M_PI * randm11r(&se);
+      y[j] = M_PI * randm11r(&se);
       c[j] = crandm11r(&se);
     }
   }
 
   printf("test 2d type 1:\n"); // -------------- type 1
-  CNTime timer; timer.start();
-  int ier = FINUFFT2D1(M,x,y,c,isign,tol,N1,N2,F,&opts);
-  double ti=timer.elapsedsec();
-  if (ier>1) {
-    printf("error (ier=%d)!\n",ier);
+  CNTime timer;
+  timer.start();
+  int ier   = FINUFFT2D1(M, x, y, c, isign, tol, N1, N2, F, &opts);
+  double ti = timer.elapsedsec();
+  if (ier > 1) {
+    printf("error (ier=%d)!\n", ier);
     return ier;
   } else
-    printf("\t%lld NU pts to (%lld,%lld) modes in %.3g s \t%.3g NU pts/s\n",
-	   (long long)M,(long long)N1,(long long)N2,ti,M/ti);
+    printf("\t%lld NU pts to (%lld,%lld) modes in %.3g s \t%.3g NU pts/s\n", (long long)M,
+           (long long)N1, (long long)N2, ti, M / ti);
 
-  BIGINT nt1 = (BIGINT)(0.37*N1), nt2 = (BIGINT)(0.26*N2);  // choose some mode index to check
-  FLT Ftr=0, Fti=0;               // crude direct...
-#pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(+:Ftr,Fti)
-  for (BIGINT j=0; j<M; ++j) {    // Ft += c[j] * exp(J*(nt1*x[j]+nt2*y[j]))
-    FLT z=(FLT)isign*(nt1*x[j]+nt2*y[j]), co=cos(z), si=sin(z);
-    Ftr += real(c[j])*co - imag(c[j])*si;  // cpx arith by hand
-    Fti += imag(c[j])*co + real(c[j])*si;
+  BIGINT nt1 = (BIGINT)(0.37 * N1), nt2 = (BIGINT)(0.26 * N2); // choose some mode index
+                                                               // to check
+  FLT Ftr = 0, Fti = 0;                                        // crude direct...
+#pragma omp parallel for schedule(static, TEST_RANDCHUNK) reduction(+ : Ftr, Fti)
+  for (BIGINT j = 0; j < M; ++j) {            // Ft += c[j] * exp(J*(nt1*x[j]+nt2*y[j]))
+    FLT z = (FLT)isign * (nt1 * x[j] + nt2 * y[j]), co = cos(z), si = sin(z);
+    Ftr += real(c[j]) * co - imag(c[j]) * si; // cpx arith by hand
+    Fti += imag(c[j]) * co + real(c[j]) * si;
   }
-  BIGINT it = N1/2+nt1 + N1*(N2/2+nt2);   // index in complex F as 1d array
-  err = abs(Ftr+IMA*Fti - F[it])/infnorm(N,F);
-  errmax = max(err,errmax);
-  printf("\tone mode: rel err in F[%lld,%lld] is %.3g\n",(long long)nt1,(long long)nt2,err);
-  if ((int64_t)M*N<=TEST_BIGPROB) {                   // also check vs full direct eval
-    CPX* Ft = (CPX*)malloc(sizeof(CPX)*N);
-    dirft2d1(M,x,y,c,isign,N1,N2,Ft);
-    err = relerrtwonorm(N,Ft,F);
-    errmax = max(err,errmax);
-    printf("\tdirft2d: rel l2-err of result F is %.3g\n",err);
+  BIGINT it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2); // index in complex F as 1d array
+  err       = abs(Ftr + IMA * Fti - F[it]) / infnorm(N, F);
+  errmax    = max(err, errmax);
+  printf("\tone mode: rel err in F[%lld,%lld] is %.3g\n", (long long)nt1, (long long)nt2,
+         err);
+  if ((int64_t)M * N <= TEST_BIGPROB) { // also check vs full direct eval
+    CPX *Ft = (CPX *)malloc(sizeof(CPX) * N);
+    dirft2d1(M, x, y, c, isign, N1, N2, Ft);
+    err    = relerrtwonorm(N, Ft, F);
+    errmax = max(err, errmax);
+    printf("\tdirft2d: rel l2-err of result F is %.3g\n", err);
     free(Ft);
   }
 
   printf("test 2d type 2:\n"); // -------------- type 2
 #pragma omp parallel
   {
-    unsigned int se=MY_OMP_GET_THREAD_NUM();
-#pragma omp for schedule(static,TEST_RANDCHUNK)
-    for (BIGINT m=0; m<N; ++m) F[m] = crandm11r(&se);
+    unsigned int se = MY_OMP_GET_THREAD_NUM();
+#pragma omp for schedule(static, TEST_RANDCHUNK)
+    for (BIGINT m = 0; m < N; ++m) F[m] = crandm11r(&se);
   }
   timer.restart();
-  ier = FINUFFT2D2(M,x,y,c,isign,tol,N1,N2,F,&opts);
-  ti=timer.elapsedsec();
-  if (ier>1) {
-    printf("error (ier=%d)!\n",ier);
+  ier = FINUFFT2D2(M, x, y, c, isign, tol, N1, N2, F, &opts);
+  ti  = timer.elapsedsec();
+  if (ier > 1) {
+    printf("error (ier=%d)!\n", ier);
     return ier;
   } else
-    printf("\t(%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n",(long long)N1,(long long)N2,(long long)M,ti,M/ti);
+    printf("\t(%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n",
+           (long long)N1, (long long)N2, (long long)M, ti, M / ti);
 
-  BIGINT jt = M/2;          // check arbitrary choice of one targ pt
-  CPX ct = CPX(0,0);
-  BIGINT m=0;
-  for (BIGINT m2=-(N2/2); m2<=(N2-1)/2; ++m2)  // loop in correct order over F
-    for (BIGINT m1=-(N1/2); m1<=(N1-1)/2; ++m1)
-      ct += F[m++] * exp(IMA*(FLT)isign*(m1*x[jt] + m2*y[jt])); // crude direct
-  err = abs(ct-c[jt])/infnorm(M,c);
-  errmax = max(err,errmax);
-  printf("\tone targ: rel err in c[%lld] is %.3g\n",(long long)jt,err);
-  if ((int64_t)M*N<=TEST_BIGPROB) {                  // also full direct eval
-    CPX* ct = (CPX*)malloc(sizeof(CPX)*M);
-    dirft2d2(M,x,y,ct,isign,N1,N2,F);
-    err = relerrtwonorm(M,ct,c);
-    errmax = max(err,errmax);
-    printf("\tdirft2d: rel l2-err of result c is %.3g\n",err);
-    //cout<<"c,ct:\n"; for (int j=0;j<M;++j) cout<<c[j]<<"\t"<<ct[j]<<endl;
+  BIGINT jt = M / 2; // check arbitrary choice of one targ pt
+  CPX ct    = CPX(0, 0);
+  BIGINT m  = 0;
+  for (BIGINT m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2) // loop in correct order over F
+    for (BIGINT m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1)
+      ct += F[m++] * exp(IMA * (FLT)isign * (m1 * x[jt] + m2 * y[jt])); // crude direct
+  err    = abs(ct - c[jt]) / infnorm(M, c);
+  errmax = max(err, errmax);
+  printf("\tone targ: rel err in c[%lld] is %.3g\n", (long long)jt, err);
+  if ((int64_t)M * N <= TEST_BIGPROB) { // also full direct eval
+    CPX *ct = (CPX *)malloc(sizeof(CPX) * M);
+    dirft2d2(M, x, y, ct, isign, N1, N2, F);
+    err    = relerrtwonorm(M, ct, c);
+    errmax = max(err, errmax);
+    printf("\tdirft2d: rel l2-err of result c is %.3g\n", err);
+    // cout<<"c,ct:\n"; for (int j=0;j<M;++j) cout<<c[j]<<"\t"<<ct[j]<<endl;
     free(ct);
   }
 
@@ -125,56 +134,63 @@ int main(int argc, char* argv[])
   // reuse the strengths c, interpret N as number of targs:
 #pragma omp parallel
   {
-    unsigned int se=MY_OMP_GET_THREAD_NUM();
-#pragma omp for schedule(static,TEST_RANDCHUNK)
-    for (BIGINT j=0; j<M; ++j) {
-      x[j] = 2.0 + M_PI*randm11r(&se);      // new x_j srcs, offset from origin
-      y[j] = -3.0 + M_PI*randm11r(&se);     // " y_j
+    unsigned int se = MY_OMP_GET_THREAD_NUM();
+#pragma omp for schedule(static, TEST_RANDCHUNK)
+    for (BIGINT j = 0; j < M; ++j) {
+      x[j] = 2.0 + M_PI * randm11r(&se);  // new x_j srcs, offset from origin
+      y[j] = -3.0 + M_PI * randm11r(&se); // " y_j
     }
   }
-  FLT* s = (FLT*)malloc(sizeof(FLT)*N);    // targ freqs (1-cmpt)
-  FLT* t = (FLT*)malloc(sizeof(FLT)*N);    // targ freqs (2-cmpt)
-  FLT S1 = (FLT)N1/2;                   // choose freq range sim to type 1
-  FLT S2 = (FLT)N2/2;
+  FLT *s = (FLT *)malloc(sizeof(FLT) * N); // targ freqs (1-cmpt)
+  FLT *t = (FLT *)malloc(sizeof(FLT) * N); // targ freqs (2-cmpt)
+  FLT S1 = (FLT)N1 / 2;                    // choose freq range sim to type 1
+  FLT S2 = (FLT)N2 / 2;
 #pragma omp parallel
   {
-    unsigned int se=MY_OMP_GET_THREAD_NUM();
-#pragma omp for schedule(static,TEST_RANDCHUNK)
-    for (BIGINT k=0; k<N; ++k) {
-      s[k] = S1*(1.7 + randm11r(&se));    //S*(1.7 + k/(FLT)N); // offset the freqs
-      t[k] = S2*(-0.5 + randm11r(&se));
+    unsigned int se = MY_OMP_GET_THREAD_NUM();
+#pragma omp for schedule(static, TEST_RANDCHUNK)
+    for (BIGINT k = 0; k < N; ++k) {
+      s[k] = S1 * (1.7 + randm11r(&se)); // S*(1.7 + k/(FLT)N); // offset the freqs
+      t[k] = S2 * (-0.5 + randm11r(&se));
     }
   }
   timer.restart();
-  ier = FINUFFT2D3(M,x,y,c,isign,tol,N,s,t,F,&opts);
-  ti=timer.elapsedsec();
-  if (ier>1) {
-    printf("error (ier=%d)!\n",ier);
+  ier = FINUFFT2D3(M, x, y, c, isign, tol, N, s, t, F, &opts);
+  ti  = timer.elapsedsec();
+  if (ier > 1) {
+    printf("error (ier=%d)!\n", ier);
     return ier;
   } else
-    printf("\t%lld NU to %lld NU in %.3g s         \t%.3g tot NU pts/s\n",(long long)M,(long long)N,ti,(M+N)/ti);
+    printf("\t%lld NU to %lld NU in %.3g s         \t%.3g tot NU pts/s\n", (long long)M,
+           (long long)N, ti, (M + N) / ti);
 
-  BIGINT kt = N/2;          // check arbitrary choice of one targ pt
-  Ftr=0, Fti=0;                 // crude direct...
-#pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(+:Ftr,Fti)
-  for (BIGINT j=0; j<M; ++j) {  // Ft += exp(IMA*(FLT)isign*(s[kt]*x[j] + t[kt]*y[j]))
-    FLT z=(FLT)isign*(s[kt]*x[j] + t[kt]*y[j]), co=cos(z), si=sin(z);
-    Ftr += real(c[j])*co - imag(c[j])*si;  // cpx arith by hand
-    Fti += imag(c[j])*co + real(c[j])*si;
+  BIGINT kt = N / 2;               // check arbitrary choice of one targ pt
+  Ftr = 0, Fti = 0;                // crude direct...
+#pragma omp parallel for schedule(static, TEST_RANDCHUNK) reduction(+ : Ftr, Fti)
+  for (BIGINT j = 0; j < M; ++j) { // Ft += c[j]*exp(IMA*isign*(s[kt]*x[j]+t[kt]*y[j]))
+    FLT z = (FLT)isign * (s[kt] * x[j] + t[kt] * y[j]), co = cos(z), si = sin(z);
+    Ftr += real(c[j]) * co - imag(c[j]) * si; // cpx arith by hand
+    Fti += imag(c[j]) * co + real(c[j]) * si;
   }
-  err = abs(Ftr+IMA*Fti - F[kt])/infnorm(N,F);
-  errmax = max(err,errmax);
-  printf("\tone targ: rel err in F[%lld] is %.3g\n",(long long)kt,err);
-  if (((int64_t)M)*N<=TEST_BIGPROB) {                  // also full direct eval
-    CPX* Ft = (CPX*)malloc(sizeof(CPX)*N);
-    dirft2d3(M,x,y,c,isign,N,s,t,Ft);       // writes to F
-    err = relerrtwonorm(N,Ft,F);
-    errmax = max(err,errmax);
-    printf("\tdirft2d: rel l2-err of result F is %.3g\n",err);
-    //cout<<"s t, F, Ft, F/Ft:\n"; for (int k=0;k<N;++k) cout<<s[k]<<" "<<t[k]<<", "<<F[k]<<",\t"<<Ft[k]<<",\t"<<F[k]/Ft[k]<<endl;
+  err    = abs(Ftr + IMA * Fti - F[kt]) / infnorm(N, F);
+  errmax = max(err, errmax);
+  printf("\tone targ: rel err in F[%lld] is %.3g\n", (long long)kt, err);
+  if (((int64_t)M) * N <= TEST_BIGPROB) {     // also full direct eval
+    CPX *Ft = (CPX *)malloc(sizeof(CPX) * N);
+    dirft2d3(M, x, y, c, isign, N, s, t, Ft); // writes to Ft
+    err    = relerrtwonorm(N, Ft, F);
+    errmax = max(err, errmax);
+    printf("\tdirft2d: rel l2-err of result F is %.3g\n", err);
+    // cout<<"s t, F, Ft, F/Ft:\n"; for (int k=0;k<N;++k) cout<<s[k]<<" "<<t[k]<<",
+    // "<<F[k]<<",\t"<<Ft[k]<<",\t"<<F[k]/Ft[k]<<endl;
     free(Ft);
   }
 
-  free(x); free(y); free(c); free(F); free(s); free(t);
-  return (errmax>errfail);
+  free(x);
+  free(y);
+  free(c);
+  free(F);
+  free(s);
+  free(t);
+  return (errmax > errfail);
 }
diff --git a/test/finufft2dmany_test.cpp b/test/finufft2dmany_test.cpp
index 31b65378e..8b0f040ee 100644
--- a/test/finufft2dmany_test.cpp
+++ b/test/finufft2dmany_test.cpp
@@ -4,246 +4,262 @@
 using namespace std;
 using namespace finufft::utils;
 
-const char* help[]={
-  "Tester for FINUFFT in 2d, vectorized, all 3 types, either precision.",
-  "",
-  "Usage: finufft2dmany_test ntrans Nmodes1 Nmodes2 Nsrc [tol [debug [spread_thread [maxbatchsize [spreadsort [upsampfac [errfail]]]]]]]",
-  "\teg:\tfinufft2dmany_test 100 1e2 1e2 1e5 1e-6 1 0 0 2 0.0 1e-5",
-  "\tnotes:\tif errfail present, exit code 1 if any error > errfail",
-  NULL};
+const char *help[] = {
+    "Tester for FINUFFT in 2d, vectorized, all 3 types, either precision.",
+    "",
+    "Usage: finufft2dmany_test ntrans Nmodes1 Nmodes2 Nsrc [tol [debug [spread_thread "
+    "[maxbatchsize [spreadsort [upsampfac [errfail]]]]]]]",
+    "\teg:\tfinufft2dmany_test 100 1e2 1e2 1e5 1e-6 1 0 0 2 0.0 1e-5",
+    "\tnotes:\tif errfail present, exit code 1 if any error > errfail",
+    NULL};
 // Melody Shih Jun 2018; Barnett removed many_seq 7/27/18. Extra args 5/21/20.
 
-int main(int argc, char* argv[])
-{
-  BIGINT M, N1, N2;              // M = # srcs, N1,N2 = # modes
-  int ntransf;                   // # of vectors for "many" interface
-  double w, tol = 1e-6;          // default
+int main(int argc, char *argv[]) {
+  BIGINT M, N1, N2;           // M = # srcs, N1,N2 = # modes
+  int ntransf;                // # of vectors for "many" interface
+  double w, tol       = 1e-6; // default
   double err, errfail = INFINITY, errmax = 0;
-  finufft_opts opts; FINUFFT_DEFAULT_OPTS(&opts);
-  //opts.fftw = FFTW_MEASURE;  // change from default FFTW_ESTIMATE
-  int isign = +1;                // choose which exponential sign to test
-  if (argc<5 || argc>12) {
-    for (int i=0; help[i]; ++i)
-      fprintf(stderr,"%s\n",help[i]);
+  finufft_opts opts;
+  FINUFFT_DEFAULT_OPTS(&opts);
+  // opts.fftw = FFTW_MEASURE;  // change from default FFTW_ESTIMATE
+  int isign = +1; // choose which exponential sign to test
+  if (argc < 5 || argc > 12) {
+    for (int i = 0; help[i]; ++i) fprintf(stderr, "%s\n", help[i]);
     return 2;
   }
-  sscanf(argv[1],"%lf",&w); ntransf = (int)w;
-  sscanf(argv[2],"%lf",&w); N1 = (BIGINT)w;
-  sscanf(argv[3],"%lf",&w); N2 = (BIGINT)w;
-  sscanf(argv[4],"%lf",&w); M = (BIGINT)w;
-  if (argc>5) sscanf(argv[5],"%lf",&tol);
-  if (argc>6) sscanf(argv[6],"%d",&opts.debug);
-  opts.spread_debug = (opts.debug>1) ? 1 : 0;  // see output from spreader
-  if (argc>7) sscanf(argv[7],"%d",&opts.spread_thread);  
-  if (argc>8) sscanf(argv[8],"%d",&opts.maxbatchsize);  
-  if (argc>9) sscanf(argv[9],"%d",&opts.spread_sort);
-  if (argc>10) { sscanf(argv[10],"%lf",&w); opts.upsampfac=(FLT)w; }
-  if (argc>11) sscanf(argv[11],"%lf",&errfail);
-  
+  sscanf(argv[1], "%lf", &w);
+  ntransf = (int)w;
+  sscanf(argv[2], "%lf", &w);
+  N1 = (BIGINT)w;
+  sscanf(argv[3], "%lf", &w);
+  N2 = (BIGINT)w;
+  sscanf(argv[4], "%lf", &w);
+  M = (BIGINT)w;
+  if (argc > 5) sscanf(argv[5], "%lf", &tol);
+  if (argc > 6) sscanf(argv[6], "%d", &opts.debug);
+  opts.spread_debug = (opts.debug > 1) ? 1 : 0; // see output from spreader
+  if (argc > 7) sscanf(argv[7], "%d", &opts.spread_thread);
+  if (argc > 8) sscanf(argv[8], "%d", &opts.maxbatchsize);
+  if (argc > 9) sscanf(argv[9], "%d", &opts.spread_sort);
+  if (argc > 10) {
+    sscanf(argv[10], "%lf", &w);
+    opts.upsampfac = (FLT)w;
+  }
+  if (argc > 11) sscanf(argv[11], "%lf", &errfail);
+
   cout << scientific << setprecision(15);
-  BIGINT N = N1*N2;
+  BIGINT N = N1 * N2;
 
-  FLT* x = (FLT*)malloc(sizeof(FLT)*M);  // NU pts x coords
-  FLT* y = (FLT*)malloc(sizeof(FLT)*M);  // NU pts y coords
-  CPX* c = (CPX*)malloc(sizeof(CPX)*M*ntransf);   // strengths 
-  CPX* F = (CPX*)malloc(sizeof(CPX)*N*ntransf);   // mode ampls
+  FLT *x = (FLT *)malloc(sizeof(FLT) * M);           // NU pts x coords
+  FLT *y = (FLT *)malloc(sizeof(FLT) * M);           // NU pts y coords
+  CPX *c = (CPX *)malloc(sizeof(CPX) * M * ntransf); // strengths
+  CPX *F = (CPX *)malloc(sizeof(CPX) * N * ntransf); // mode ampls
 
 #pragma omp parallel
   {
-    unsigned int se=MY_OMP_GET_THREAD_NUM();
-#pragma omp for schedule(static,TEST_RANDCHUNK)
-    for (BIGINT j=0; j<M; ++j) {
-      x[j] = M_PI*randm11r(&se);
-      y[j] = M_PI*randm11r(&se);
+    unsigned int se = MY_OMP_GET_THREAD_NUM();
+#pragma omp for schedule(static, TEST_RANDCHUNK)
+    for (BIGINT j = 0; j < M; ++j) {
+      x[j] = M_PI * randm11r(&se);
+      y[j] = M_PI * randm11r(&se);
     }
-#pragma omp for schedule(static,TEST_RANDCHUNK)
-    for (BIGINT j = 0; j<ntransf*M; ++j)
-    {
-        c[j] = crandm11r(&se);
+#pragma omp for schedule(static, TEST_RANDCHUNK)
+    for (BIGINT j = 0; j < ntransf * M; ++j) {
+      c[j] = crandm11r(&se);
     }
   }
 
   printf("test 2d1 many vs repeated single: ------------------------------------\n");
-  CNTime timer; timer.start();
-  int ier = FINUFFT2D1MANY(ntransf,M,x,y,c,isign,tol,N1,N2,F,&opts);
-  double ti=timer.elapsedsec();
-  if (ier>1) {
-    printf("error (ier=%d)!\n",ier);
+  CNTime timer;
+  timer.start();
+  int ier   = FINUFFT2D1MANY(ntransf, M, x, y, c, isign, tol, N1, N2, F, &opts);
+  double ti = timer.elapsedsec();
+  if (ier > 1) {
+    printf("error (ier=%d)!\n", ier);
     return ier;
   } else
-    printf("ntr=%d: %lld NU pts to (%lld,%lld) modes in %.3g s \t%.3g NU pts/s\n", ntransf,(long long)M,(long long)N1,(long long)N2,ti,ntransf*M/ti);
-  
-  int i = ntransf-1;    // choose a vector (transform number) to check
-  BIGINT nt1 = (BIGINT)(0.37*N1), nt2 = (BIGINT)(0.26*N2);  // choose some mode index to check
-  CPX Ft = CPX(0,0), J = IMA*(FLT)isign;
-  for (BIGINT j=0; j<M; ++j)
-    Ft += c[j+i*M] * exp(J*(nt1*x[j]+nt2*y[j]));   // crude direct
-  BIGINT it = N1/2+nt1 + N1*(N2/2+nt2);   // index in complex F as 1d array
-  err = abs(Ft-F[it+i*N])/infnorm(N,F+i*N);
-  errmax = max(err,errmax);
-  printf("\tone mode: rel err in F[%lld,%lld] of trans#%d is %.3g\n",
-	 (long long)nt1,(long long)nt2,i,err);
+    printf("ntr=%d: %lld NU pts to (%lld,%lld) modes in %.3g s \t%.3g NU pts/s\n",
+           ntransf, (long long)M, (long long)N1, (long long)N2, ti, ntransf * M / ti);
+
+  int i      = ntransf - 1; // choose a vector (transform number) to check
+  BIGINT nt1 = (BIGINT)(0.37 * N1), nt2 = (BIGINT)(0.26 * N2); // choose some mode index
+                                                               // to check
+  CPX Ft = CPX(0, 0), J = IMA * (FLT)isign;
+  for (BIGINT j = 0; j < M; ++j)
+    Ft += c[j + i * M] * exp(J * (nt1 * x[j] + nt2 * y[j])); // crude direct
+  BIGINT it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2); // index in complex F as 1d array
+  err       = abs(Ft - F[it + i * N]) / infnorm(N, F + i * N);
+  errmax    = max(err, errmax);
+  printf("\tone mode: rel err in F[%lld,%lld] of trans#%d is %.3g\n", (long long)nt1,
+         (long long)nt2, i, err);
 
   // compare the result with FINUFFT2D1
   FFTW_FORGET_WISDOM();
   finufft_opts simpleopts = opts;
-  simpleopts.debug = 0;       // don't output timing for calls of FINUFFT2D1
+  simpleopts.debug        = 0; // don't output timing for calls of FINUFFT2D1
   simpleopts.spread_debug = 0;
 
-  CPX* cstart;
-  CPX* Fstart;
-  CPX* F_2d1 = (CPX*)malloc(sizeof(CPX)*N*ntransf);
+  CPX *cstart;
+  CPX *Fstart;
+  CPX *F_2d1 = (CPX *)malloc(sizeof(CPX) * N * ntransf);
   timer.restart();
-  for (int k= 0; k<ntransf; ++k)
-  {
-    cstart = c+k*M;
-    Fstart = F_2d1+k*N;
-    ier = FINUFFT2D1(M,x,y,cstart,isign,tol,N1,N2,Fstart,&simpleopts);
+  for (int k = 0; k < ntransf; ++k) {
+    cstart = c + k * M;
+    Fstart = F_2d1 + k * N;
+    ier    = FINUFFT2D1(M, x, y, cstart, isign, tol, N1, N2, Fstart, &simpleopts);
   }
-  double t=timer.elapsedsec();
-  if (ier>1) {
-    printf("error (ier=%d)!\n",ier);
+  double t = timer.elapsedsec();
+  if (ier > 1) {
+    printf("error (ier=%d)!\n", ier);
     return ier;
   } else
-    printf("%d of: %lld NU pts to (%lld,%lld) modes in %.3g s \t%.3g NU pts/s\n", ntransf,(long long)M,(long long)N1,(long long)N2,t,ntransf*M/t);
-  printf("\t\t\tspeedup \t T_FINUFFT2D1 / T_finufft2d1many = %.3g\n", t/ti);
+    printf("%d of: %lld NU pts to (%lld,%lld) modes in %.3g s \t%.3g NU pts/s\n", ntransf,
+           (long long)M, (long long)N1, (long long)N2, t, ntransf * M / t);
+  printf("\t\t\tspeedup \t T_FINUFFT2D1 / T_finufft2d1many = %.3g\n", t / ti);
 
   // Check consistency (worst over the ntransf)
   double maxerror = 0.0;
   for (int k = 0; k < ntransf; ++k)
-    maxerror = max(maxerror, (double)relerrtwonorm(N,F_2d1+k*N,F+k*N));
-  errmax = max(maxerror,errmax);  
-  printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2  ) =  %.3g\n",maxerror);
+    maxerror = max(maxerror, (double)relerrtwonorm(N, F_2d1 + k * N, F + k * N));
+  errmax = max(maxerror, errmax);
+  printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2  ) =  %.3g\n", maxerror);
   free(F_2d1);
 
   printf("test 2d2 many vs repeated single: ------------------------------------\n");
-  
+
 #pragma omp parallel
   {
-    unsigned int se=MY_OMP_GET_THREAD_NUM();
-#pragma omp for schedule(static,TEST_RANDCHUNK)
-    for (BIGINT m=0; m<N*ntransf; ++m) F[m] = crandm11r(&se);
+    unsigned int se = MY_OMP_GET_THREAD_NUM();
+#pragma omp for schedule(static, TEST_RANDCHUNK)
+    for (BIGINT m = 0; m < N * ntransf; ++m) F[m] = crandm11r(&se);
   }
 
   FFTW_FORGET_WISDOM();
   timer.restart();
-  ier = FINUFFT2D2MANY(ntransf,M,x,y,c,isign,tol,N1,N2,F,&opts);
-  ti=timer.elapsedsec();
-  if (ier>1) {
-    printf("error (ier=%d)!\n",ier);
+  ier = FINUFFT2D2MANY(ntransf, M, x, y, c, isign, tol, N1, N2, F, &opts);
+  ti  = timer.elapsedsec();
+  if (ier > 1) {
+    printf("error (ier=%d)!\n", ier);
     return ier;
   } else
-    printf("ntr=%d: (%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", ntransf,(long long)N1,(long long)N2,(long long)M,ti,ntransf*M/ti);
+    printf("ntr=%d: (%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n",
+           ntransf, (long long)N1, (long long)N2, (long long)M, ti, ntransf * M / ti);
 
   FFTW_FORGET_WISDOM();
-  i = ntransf-1;   // choose a data to check
-  BIGINT jt = M/2;    // check arbitrary choice of one targ pt
-  CPX ct = CPX(0,0);
-  BIGINT m=0;
-  for (BIGINT m2=-(N2/2); m2<=(N2-1)/2; ++m2)  // loop in correct order over F
-    for (BIGINT m1=-(N1/2); m1<=(N1-1)/2; ++m1)
-      ct += F[i*N + m++] * exp(J*(m1*x[jt] + m2*y[jt]));   // crude direct
-  err = abs(ct-c[jt+i*M])/infnorm(M,c+i*M);
-  errmax = max(err,errmax);
-  printf("\tone targ: rel err in c[%lld] of trans#%d is %.3g\n",(long long)jt,i,err);
-  
+  i         = ntransf - 1; // choose a transform to check
+  BIGINT jt = M / 2;       // check arbitrary choice of one targ pt
+  CPX ct    = CPX(0, 0);
+  BIGINT m  = 0;
+  for (BIGINT m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2) // loop in correct order over F
+    for (BIGINT m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1)
+      ct += F[i * N + m++] * exp(J * (m1 * x[jt] + m2 * y[jt])); // crude direct
+  err    = abs(ct - c[jt + i * M]) / infnorm(M, c + i * M);
+  errmax = max(err, errmax);
+  printf("\tone targ: rel err in c[%lld] of trans#%d is %.3g\n", (long long)jt, i, err);
+
   // compare the result with single calls to FINUFFT2D2...
-  CPX* c_2d2 = (CPX*)malloc(sizeof(CPX)*M*ntransf);
+  CPX *c_2d2 = (CPX *)malloc(sizeof(CPX) * M * ntransf);
   timer.restart();
-  for (int k=0; k<ntransf; ++k)
-  {
-    cstart = c_2d2+k*M;
-    Fstart = F+k*N;
-    ier = FINUFFT2D2(M,x,y,cstart,isign,tol,N1,N2,Fstart,&simpleopts);
+  for (int k = 0; k < ntransf; ++k) {
+    cstart = c_2d2 + k * M;
+    Fstart = F + k * N;
+    ier    = FINUFFT2D2(M, x, y, cstart, isign, tol, N1, N2, Fstart, &simpleopts);
   }
   t = timer.elapsedsec();
-  if (ier>1) {
-    printf("error (ier=%d)!\n",ier);
+  if (ier > 1) {
+    printf("error (ier=%d)!\n", ier);
     return ier;
   } else
-    printf("%d of: (%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", ntransf,(long long)N1,(long long)N2,(long long)M,t,ntransf*M/t);
-  printf("\t\t\tspeedup \t T_FINUFFT2D2 / T_finufft2d2many = %.3g\n", t/ti);
+    printf("%d of: (%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", ntransf,
+           (long long)N1, (long long)N2, (long long)M, t, ntransf * M / t);
+  printf("\t\t\tspeedup \t T_FINUFFT2D2 / T_finufft2d2many = %.3g\n", t / ti);
 
-  maxerror = 0.0;           // worst error over the ntransf
+  maxerror = 0.0; // worst error over the ntransf
   for (int k = 0; k < ntransf; ++k)
-    maxerror = max(maxerror, (double)relerrtwonorm(M,c_2d2+k*M,c+k*M));
-  errmax = max(maxerror,errmax);
-  printf("\tconsistency check: sup ( ||c_many-c||_2 / ||c||_2 ) =  %.3g\n",maxerror);
+    maxerror = max(maxerror, (double)relerrtwonorm(M, c_2d2 + k * M, c + k * M));
+  errmax = max(maxerror, errmax);
+  printf("\tconsistency check: sup ( ||c_many-c||_2 / ||c||_2 ) =  %.3g\n", maxerror);
   free(c_2d2);
 
   printf("test 2d3 many vs repeated single: ------------------------------------\n");
   FFTW_FORGET_WISDOM();
-  
+
   // reuse the strengths c, interpret N as number of targs:
 #pragma omp parallel
   {
-    unsigned int se=MY_OMP_GET_THREAD_NUM();
-#pragma omp for schedule(static,TEST_RANDCHUNK)
-    for (BIGINT j=0; j<M; ++j) {
-      x[j] = 2.0 + M_PI*randm11r(&se);      // new x_j srcs, offset from origin
-      y[j] = -3.0 + M_PI*randm11r(&se);     // " y_j
+    unsigned int se = MY_OMP_GET_THREAD_NUM();
+#pragma omp for schedule(static, TEST_RANDCHUNK)
+    for (BIGINT j = 0; j < M; ++j) {
+      x[j] = 2.0 + M_PI * randm11r(&se);  // new x_j srcs, offset from origin
+      y[j] = -3.0 + M_PI * randm11r(&se); // " y_j
     }
   }
-  
-  FLT* s_freq = (FLT*)malloc(sizeof(FLT)*N);    // targ freqs (1-cmpt)
-  FLT* t_freq = (FLT*)malloc(sizeof(FLT)*N);    // targ freqs (2-cmpt)
-  FLT S1 = (FLT)N1/2;                   // choose freq range sim to type 1
-  FLT S2 = (FLT)N2/2;
+
+  FLT *s_freq = (FLT *)malloc(sizeof(FLT) * N); // targ freqs (1-cmpt)
+  FLT *t_freq = (FLT *)malloc(sizeof(FLT) * N); // targ freqs (2-cmpt)
+  FLT S1      = (FLT)N1 / 2;                    // choose freq range sim to type 1
+  FLT S2      = (FLT)N2 / 2;
 
 #pragma omp parallel
   {
-    unsigned int se=MY_OMP_GET_THREAD_NUM();
-#pragma omp for schedule(static,TEST_RANDCHUNK)
-    for (BIGINT k=0; k<N; ++k) {
-      s_freq[k] = S1*(1.7 + randm11r(&se));    //S*(1.7 + k/(FLT)N); // offset the freqs
-      t_freq[k] = S2*(-0.5 + randm11r(&se));
+    unsigned int se = MY_OMP_GET_THREAD_NUM();
+#pragma omp for schedule(static, TEST_RANDCHUNK)
+    for (BIGINT k = 0; k < N; ++k) {
+      s_freq[k] = S1 * (1.7 + randm11r(&se)); // S*(1.7 + k/(FLT)N); // offset the freqs
+      t_freq[k] = S2 * (-0.5 + randm11r(&se));
     }
   }
 
   timer.restart();
-  ier = FINUFFT2D3MANY(ntransf,M,x,y,c,isign,tol,N,s_freq,t_freq,F,&opts);
-  ti=timer.elapsedsec();
-  if (ier>1) {
-    printf("error (ier=%d)!\n",ier);
+  ier = FINUFFT2D3MANY(ntransf, M, x, y, c, isign, tol, N, s_freq, t_freq, F, &opts);
+  ti  = timer.elapsedsec();
+  if (ier > 1) {
+    printf("error (ier=%d)!\n", ier);
     return ier;
   } else
-    printf("ntr=%d: %lld NU to %lld NU in %.3g s      \t%.3g tot NU pts/s\n",ntransf, (long long)M,(long long)N,ti,ntransf*(M+N)/ti);
-
-  i = ntransf-1;            // choose a transform to check
-  BIGINT kt = N/4;          // check arbitrary choice of one targ pt
-  Ft = CPX(0,0);
-  for (BIGINT j=0;j<M;++j)
-    Ft += c[i*M + j] * exp(J*(s_freq[kt]*x[j] + t_freq[kt]*y[j]));
-  err = abs(Ft-F[kt+i*N])/infnorm(N,F+i*N);
-  errmax = max(err,errmax);
-  printf("\tone targ: rel err in F[%lld] of trans#%d is %.3g\n",(long long)kt,i,err);
-
-// compare the result with FINUFFT2D3...
+    printf("ntr=%d: %lld NU to %lld NU in %.3g s      \t%.3g tot NU pts/s\n", ntransf,
+           (long long)M, (long long)N, ti, ntransf * (M + N) / ti);
+
+  i         = ntransf - 1; // choose a transform to check
+  BIGINT kt = N / 4;       // check arbitrary choice of one targ pt
+  Ft        = CPX(0, 0);
+  for (BIGINT j = 0; j < M; ++j)
+    Ft += c[i * M + j] * exp(J * (s_freq[kt] * x[j] + t_freq[kt] * y[j]));
+  err    = abs(Ft - F[kt + i * N]) / infnorm(N, F + i * N);
+  errmax = max(err, errmax);
+  printf("\tone targ: rel err in F[%lld] of trans#%d is %.3g\n", (long long)kt, i, err);
+
+  // compare the result with FINUFFT2D3...
   FFTW_FORGET_WISDOM();
-  CPX* f_2d3 = (CPX*)malloc(sizeof(CPX)*N*ntransf);
+  CPX *f_2d3 = (CPX *)malloc(sizeof(CPX) * N * ntransf);
   timer.restart();
-  for (int k=0; k<ntransf; ++k)
-  {
-    Fstart = f_2d3+k*N;
-    cstart = c+k*M;
-    ier = FINUFFT2D3(M,x,y,cstart,isign,tol,N, s_freq,t_freq,Fstart,&simpleopts);
+  for (int k = 0; k < ntransf; ++k) {
+    Fstart = f_2d3 + k * N;
+    cstart = c + k * M;
+    ier = FINUFFT2D3(M, x, y, cstart, isign, tol, N, s_freq, t_freq, Fstart, &simpleopts);
   }
   t = timer.elapsedsec();
-  if (ier>1) {
-    printf("error (ier=%d)!\n",ier);
+  if (ier > 1) {
+    printf("error (ier=%d)!\n", ier);
     return ier;
   } else
-    printf("%d of: %lld NU to %lld NU in %.3g s       \t%.3g tot NU pts/s\n",ntransf, (long long)M,(long long)N,t,ntransf*(M+N)/t);
-  printf("\t\t\tspeedup \t T_FINUFFT2D3 / T_finufft2d3many = %.3g\n", t/ti);
+    printf("%d of: %lld NU to %lld NU in %.3g s       \t%.3g tot NU pts/s\n", ntransf,
+           (long long)M, (long long)N, t, ntransf * (M + N) / t);
+  printf("\t\t\tspeedup \t T_FINUFFT2D3 / T_finufft2d3many = %.3g\n", t / ti);
 
-  //check against the old
-  maxerror = 0.0;           // worst error over the ntransf
+  // check against the old
+  maxerror = 0.0; // worst error over the ntransf
   for (int k = 0; k < ntransf; ++k)
-    maxerror = max(maxerror, (double)relerrtwonorm(N,f_2d3+k*N,F+k*N));
-  errmax = max(maxerror,errmax);
-  printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2 ) =  %.3g\n",maxerror);
+    maxerror = max(maxerror, (double)relerrtwonorm(N, f_2d3 + k * N, F + k * N));
+  errmax = max(maxerror, errmax);
+  printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2 ) =  %.3g\n", maxerror);
   free(f_2d3);
-  
-  free(x); free(y); free(c); free(F); free(s_freq); free(t_freq);
-  return (errmax>errfail);
+
+  free(x);
+  free(y);
+  free(c);
+  free(F);
+  free(s_freq);
+  free(t_freq);
+  return (errmax > errfail);
 }
diff --git a/test/finufft3d_test.cpp b/test/finufft3d_test.cpp
index 29dba95d0..39ee8ab6d 100644
--- a/test/finufft3d_test.cpp
+++ b/test/finufft3d_test.cpp
@@ -4,127 +4,136 @@
 using namespace std;
 using namespace finufft::utils;
 
-const char* help[]={
-  "Tester for FINUFFT in 3d, all 3 types, either precision.",
-  "",
-  "Usage: finufft3d_test Nmodes1 Nmodes2 Nmodes3 Nsrc [tol [debug [spread_sort [upsampfac [errfail]]]]]",
-  "\teg:\tfinufft3d_test 100 200 50 1e6 1e-12 0 2 0.0 1e-11",
-  "\tnotes:\tif errfail present, exit code 1 if any error > errfail",
-  NULL};
+const char *help[] = {"Tester for FINUFFT in 3d, all 3 types, either precision.",
+                      "",
+                      "Usage: finufft3d_test Nmodes1 Nmodes2 Nmodes3 Nsrc [tol [debug "
+                      "[spread_sort [upsampfac [errfail]]]]]",
+                      "\teg:\tfinufft3d_test 100 200 50 1e6 1e-12 0 2 0.0 1e-11",
+                      "\tnotes:\tif errfail present, exit code 1 if any error > errfail",
+                      NULL};
 // Barnett 2/2/17 onwards.
 
-int main(int argc, char* argv[])
-{
+int main(int argc, char *argv[]) {
   BIGINT M, N1, N2, N3;       // M = # srcs, N1,N2,N3 = # modes
-  double w, tol = 1e-6;       // default
+  double w, tol       = 1e-6; // default
   double err, errfail = INFINITY, errmax = 0;
-  finufft_opts opts; FINUFFT_DEFAULT_OPTS(&opts);
-  //opts.fftw = FFTW_MEASURE;  // change from usual FFTW_ESTIMATE
-  //opts.spread_max_sp_size = 3e4; // override test
-  //opts.spread_nthr_atomic = 15;  // "
-  int isign = +1;             // choose which exponential sign to test
-  if (argc<5 || argc>10) {
-    for (int i=0; help[i]; ++i)
-      fprintf(stderr,"%s\n",help[i]);
+  finufft_opts opts;
+  FINUFFT_DEFAULT_OPTS(&opts);
+  // opts.fftw = FFTW_MEASURE;  // change from usual FFTW_ESTIMATE
+  // opts.spread_max_sp_size = 3e4; // override test
+  // opts.spread_nthr_atomic = 15;  // "
+  int isign = +1; // choose which exponential sign to test
+  if (argc < 5 || argc > 10) {
+    for (int i = 0; help[i]; ++i) fprintf(stderr, "%s\n", help[i]);
     return 2;
   }
-  sscanf(argv[1],"%lf",&w); N1 = (BIGINT)w;
-  sscanf(argv[2],"%lf",&w); N2 = (BIGINT)w;
-  sscanf(argv[3],"%lf",&w); N3 = (BIGINT)w;
-  sscanf(argv[4],"%lf",&w); M = (BIGINT)w;
-  if (argc>5) sscanf(argv[5],"%lf",&tol);
-  if (argc>6) sscanf(argv[6],"%d",&opts.debug);  // can be 0,1 or 2
-  opts.spread_debug = (opts.debug>1) ? 1 : 0;  // see output from spreader
-  if (argc>7) sscanf(argv[7],"%d",&opts.spread_sort);
-  if (argc>8) { sscanf(argv[8],"%lf",&w); opts.upsampfac=(FLT)w; }
-  if (argc>9) sscanf(argv[9],"%lf",&errfail);
-  
+  sscanf(argv[1], "%lf", &w);
+  N1 = (BIGINT)w;
+  sscanf(argv[2], "%lf", &w);
+  N2 = (BIGINT)w;
+  sscanf(argv[3], "%lf", &w);
+  N3 = (BIGINT)w;
+  sscanf(argv[4], "%lf", &w);
+  M = (BIGINT)w;
+  if (argc > 5) sscanf(argv[5], "%lf", &tol);
+  if (argc > 6) sscanf(argv[6], "%d", &opts.debug); // can be 0,1 or 2
+  opts.spread_debug = (opts.debug > 1) ? 1 : 0;     // see output from spreader
+  if (argc > 7) sscanf(argv[7], "%d", &opts.spread_sort);
+  if (argc > 8) {
+    sscanf(argv[8], "%lf", &w);
+    opts.upsampfac = (FLT)w;
+  }
+  if (argc > 9) sscanf(argv[9], "%lf", &errfail);
+
   cout << scientific << setprecision(15);
-  BIGINT N = N1*N2*N3;
+  BIGINT N = N1 * N2 * N3;
 
-  FLT *x = (FLT *)malloc(sizeof(FLT)*M);        // NU pts x coords
-  FLT *y = (FLT *)malloc(sizeof(FLT)*M);        // NU pts y coords
-  FLT *z = (FLT *)malloc(sizeof(FLT)*M);        // NU pts z coords
-  CPX* c = (CPX*)malloc(sizeof(CPX)*M);   // strengths 
-  CPX* F = (CPX*)malloc(sizeof(CPX)*N);   // mode ampls
+  FLT *x = (FLT *)malloc(sizeof(FLT) * M); // NU pts x coords
+  FLT *y = (FLT *)malloc(sizeof(FLT) * M); // NU pts y coords
+  FLT *z = (FLT *)malloc(sizeof(FLT) * M); // NU pts z coords
+  CPX *c = (CPX *)malloc(sizeof(CPX) * M); // strengths
+  CPX *F = (CPX *)malloc(sizeof(CPX) * N); // mode ampls
 #pragma omp parallel
   {
-    unsigned int se=MY_OMP_GET_THREAD_NUM();  // needed for parallel random #s
-#pragma omp for schedule(static,TEST_RANDCHUNK)
-    for (BIGINT j=0; j<M; ++j) {
-      x[j] = M_PI*randm11r(&se);
-      y[j] = M_PI*randm11r(&se);
-      z[j] = M_PI*randm11r(&se);
+    unsigned int se = MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s
+#pragma omp for schedule(static, TEST_RANDCHUNK)
+    for (BIGINT j = 0; j < M; ++j) {
+      x[j] = M_PI * randm11r(&se);
+      y[j] = M_PI * randm11r(&se);
+      z[j] = M_PI * randm11r(&se);
       c[j] = crandm11r(&se);
     }
   }
 
   printf("test 3d type 1:\n"); // -------------- type 1
-  CNTime timer; timer.start();
-  int ier = FINUFFT3D1(M,x,y,z,c,isign,tol,N1,N2,N3,F,&opts);
-  double ti=timer.elapsedsec();
-  if (ier>1) {
-    printf("error (ier=%d)!\n",ier);
+  CNTime timer;
+  timer.start();
+  int ier   = FINUFFT3D1(M, x, y, z, c, isign, tol, N1, N2, N3, F, &opts);
+  double ti = timer.elapsedsec();
+  if (ier > 1) {
+    printf("error (ier=%d)!\n", ier);
     return ier;
   } else
     printf("     %lld NU pts to (%lld,%lld,%lld) modes in %.3g s \t%.3g NU pts/s\n",
-	   (long long)M,(long long)N1,(long long)N2,(long long)N3,ti,M/ti);
+           (long long)M, (long long)N1, (long long)N2, (long long)N3, ti, M / ti);
 
-  BIGINT nt1 = (BIGINT)(0.37*N1), nt2 = (BIGINT)(0.26*N2), nt3 = (BIGINT)(-0.39*N3);  // choose mode to check
-  FLT Ftr=0, Fti=0;               // crude direct...
-#pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(+:Ftr,Fti)
-  for (BIGINT j=0; j<M; ++j) {    // Ft += c[j] * exp(J*(nt1*x[j]+nt2*y[j]+nt3*z[j]))
-    FLT w=(FLT)isign*(nt1*x[j]+nt2*y[j]+nt3*z[j]), co=cos(w), si=sin(w);
-    Ftr += real(c[j])*co - imag(c[j])*si;  // cpx arith by hand
-    Fti += imag(c[j])*co + real(c[j])*si;
+  BIGINT nt1 = (BIGINT)(0.37 * N1), nt2 = (BIGINT)(0.26 * N2),
+         nt3 = (BIGINT)(-0.39 * N3); // choose mode to check
+  FLT Ftr = 0, Fti = 0;              // crude direct...
+#pragma omp parallel for schedule(static, TEST_RANDCHUNK) reduction(+ : Ftr, Fti)
+  for (BIGINT j = 0; j < M; ++j) {   // Ft += c[j] * exp(J*(nt1*x[j]+nt2*y[j]+nt3*z[j]))
+    FLT w = (FLT)isign * (nt1 * x[j] + nt2 * y[j] + nt3 * z[j]), co = cos(w), si = sin(w);
+    Ftr += real(c[j]) * co - imag(c[j]) * si; // cpx arith by hand
+    Fti += imag(c[j]) * co + real(c[j]) * si;
   }
   // index in complex F as 1d array...
-  BIGINT it = N1/2+nt1 + N1*(N2/2+nt2) + N1*N2*(N3/2+nt3);
-  err = abs(Ftr+IMA*Fti - F[it])/infnorm(N,F);
-  errmax = max(err,errmax);
-  printf("\tone mode: rel err in F[%lld,%lld,%lld] is %.3g\n",(long long)nt1,(long long)nt2,(long long)nt3,err);
-  if ((int64_t)M*N<=TEST_BIGPROB) {                   // also check vs full direct eval
-    CPX* Ft = (CPX*)malloc(sizeof(CPX)*N);
-    dirft3d1(M,x,y,z,c,isign,N1,N2,N3,Ft);
-    err = relerrtwonorm(N,Ft,F);
-    errmax = max(err,errmax);
-    printf("\tdirft3d: rel l2-err of result F is %.3g\n",err);
+  BIGINT it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2) + N1 * N2 * (N3 / 2 + nt3);
+  err       = abs(Ftr + IMA * Fti - F[it]) / infnorm(N, F);
+  errmax    = max(err, errmax);
+  printf("\tone mode: rel err in F[%lld,%lld,%lld] is %.3g\n", (long long)nt1,
+         (long long)nt2, (long long)nt3, err);
+  if ((int64_t)M * N <= TEST_BIGPROB) { // also check vs full direct eval
+    CPX *Ft = (CPX *)malloc(sizeof(CPX) * N);
+    dirft3d1(M, x, y, z, c, isign, N1, N2, N3, Ft);
+    err    = relerrtwonorm(N, Ft, F);
+    errmax = max(err, errmax);
+    printf("\tdirft3d: rel l2-err of result F is %.3g\n", err);
     free(Ft);
   }
-  
+
   printf("test 3d type 2:\n"); // -------------- type 2
 #pragma omp parallel
   {
-    unsigned int se=MY_OMP_GET_THREAD_NUM();
-#pragma omp for schedule(static,TEST_RANDCHUNK)
-    for (BIGINT m=0; m<N; ++m) F[m] = crandm11r(&se);
+    unsigned int se = MY_OMP_GET_THREAD_NUM();
+#pragma omp for schedule(static, TEST_RANDCHUNK)
+    for (BIGINT m = 0; m < N; ++m) F[m] = crandm11r(&se);
   }
   timer.restart();
-  ier = FINUFFT3D2(M,x,y,z,c,isign,tol,N1,N2,N3,F,&opts);
-  ti=timer.elapsedsec();
-  if (ier>1) {
-    printf("error (ier=%d)!\n",ier);
+  ier = FINUFFT3D2(M, x, y, z, c, isign, tol, N1, N2, N3, F, &opts);
+  ti  = timer.elapsedsec();
+  if (ier > 1) {
+    printf("error (ier=%d)!\n", ier);
     return ier;
   } else
     printf("     (%lld,%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n",
-	   (long long)N1,(long long)N2,(long long)N3,(long long)M,ti,M/ti);
+           (long long)N1, (long long)N2, (long long)N3, (long long)M, ti, M / ti);
 
-  BIGINT jt = M/2;          // check arbitrary choice of one targ pt
-  CPX ct = CPX(0,0);
-  BIGINT m=0;
-  for (BIGINT m3=-(N3/2); m3<=(N3-1)/2; ++m3)   // loop in F order
-    for (BIGINT m2=-(N2/2); m2<=(N2-1)/2; ++m2)
-      for (BIGINT m1=-(N1/2); m1<=(N1-1)/2; ++m1)
-	ct += F[m++] * exp(IMA*(FLT)isign*(m1*x[jt] + m2*y[jt] + m3*z[jt]));
-  err = abs(ct-c[jt])/infnorm(M,c);
-  errmax = max(err,errmax);
-  printf("\tone targ: rel err in c[%lld] is %.3g\n",(long long)jt,err);
-  if ((int64_t)M*N<=TEST_BIGPROB) {                  // also full direct eval
-    CPX* ct = (CPX*)malloc(sizeof(CPX)*M);
-    dirft3d2(M,x,y,z,ct,isign,N1,N2,N3,F);
-    err = relerrtwonorm(M,ct,c);
-    errmax = max(err,errmax);
-    printf("\tdirft3d: rel l2-err of result c is %.3g\n",err);
+  BIGINT jt = M / 2; // check arbitrary choice of one targ pt
+  CPX ct    = CPX(0, 0);
+  BIGINT m  = 0;
+  for (BIGINT m3 = -(N3 / 2); m3 <= (N3 - 1) / 2; ++m3) // loop in F order
+    for (BIGINT m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2)
+      for (BIGINT m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1)
+        ct += F[m++] * exp(IMA * (FLT)isign * (m1 * x[jt] + m2 * y[jt] + m3 * z[jt]));
+  err    = abs(ct - c[jt]) / infnorm(M, c);
+  errmax = max(err, errmax);
+  printf("\tone targ: rel err in c[%lld] is %.3g\n", (long long)jt, err);
+  if ((int64_t)M * N <= TEST_BIGPROB) { // also full direct eval
+    CPX *ct = (CPX *)malloc(sizeof(CPX) * M);
+    dirft3d2(M, x, y, z, ct, isign, N1, N2, N3, F);
+    err    = relerrtwonorm(M, ct, c);
+    errmax = max(err, errmax);
+    printf("\tdirft3d: rel l2-err of result c is %.3g\n", err);
     free(ct);
   }
 
@@ -132,60 +141,71 @@ int main(int argc, char* argv[])
   // reuse the strengths c, interpret N as number of targs:
 #pragma omp parallel
   {
-    unsigned int se=MY_OMP_GET_THREAD_NUM();
-#pragma omp for schedule(static,TEST_RANDCHUNK)
-    for (BIGINT j=0; j<M; ++j) {
-      x[j] = 2.0 + M_PI*randm11r(&se);      // new x_j srcs, offset from origin
-      y[j] = -3.0 + M_PI*randm11r(&se);     // " y_j
-      z[j] = 1.0 + M_PI*randm11r(&se);      // " z_j
+    unsigned int se = MY_OMP_GET_THREAD_NUM();
+#pragma omp for schedule(static, TEST_RANDCHUNK)
+    for (BIGINT j = 0; j < M; ++j) {
+      x[j] = 2.0 + M_PI * randm11r(&se);  // new x_j srcs, offset from origin
+      y[j] = -3.0 + M_PI * randm11r(&se); // " y_j
+      z[j] = 1.0 + M_PI * randm11r(&se);  // " z_j
     }
   }
-  FLT* s = (FLT*)malloc(sizeof(FLT)*N);    // targ freqs (1-cmpt)
-  FLT* t = (FLT*)malloc(sizeof(FLT)*N);    // targ freqs (2-cmpt)
-  FLT* u = (FLT*)malloc(sizeof(FLT)*N);    // targ freqs (3-cmpt)
-  FLT S1 = (FLT)N1/2;                   // choose freq range sim to type 1
-  FLT S2 = (FLT)N2/2;
-  FLT S3 = (FLT)N3/2;
+  FLT *s = (FLT *)malloc(sizeof(FLT) * N); // targ freqs (1-cmpt)
+  FLT *t = (FLT *)malloc(sizeof(FLT) * N); // targ freqs (2-cmpt)
+  FLT *u = (FLT *)malloc(sizeof(FLT) * N); // targ freqs (3-cmpt)
+  FLT S1 = (FLT)N1 / 2;                    // choose freq range sim to type 1
+  FLT S2 = (FLT)N2 / 2;
+  FLT S3 = (FLT)N3 / 2;
 #pragma omp parallel
   {
-    unsigned int se=MY_OMP_GET_THREAD_NUM();
-#pragma omp for schedule(static,TEST_RANDCHUNK)
-    for (BIGINT k=0; k<N; ++k) {
-      s[k] = S1*(1.7 + randm11r(&se));  //S*(1.7 + k/(FLT)N); // offset the freqs
-      t[k] = S2*(-0.5 + randm11r(&se));
-      u[k] = S3*(0.9 + randm11r(&se));
+    unsigned int se = MY_OMP_GET_THREAD_NUM();
+#pragma omp for schedule(static, TEST_RANDCHUNK)
+    for (BIGINT k = 0; k < N; ++k) {
+      s[k] = S1 * (1.7 + randm11r(&se)); // S*(1.7 + k/(FLT)N); // offset the freqs
+      t[k] = S2 * (-0.5 + randm11r(&se));
+      u[k] = S3 * (0.9 + randm11r(&se));
     }
   }
   timer.restart();
-  ier = FINUFFT3D3(M,x,y,z,c,isign,tol,N,s,t,u,F,&opts);
-  ti=timer.elapsedsec();
-  if (ier>1) {
-    printf("error (ier=%d)!\n",ier);
+  ier = FINUFFT3D3(M, x, y, z, c, isign, tol, N, s, t, u, F, &opts);
+  ti  = timer.elapsedsec();
+  if (ier > 1) {
+    printf("error (ier=%d)!\n", ier);
     return ier;
   } else
-    printf("\t%lld NU to %lld NU in %.3g s         \t%.3g tot NU pts/s\n",(long long)M,(long long)N,ti,(M+N)/ti);
+    printf("\t%lld NU to %lld NU in %.3g s         \t%.3g tot NU pts/s\n", (long long)M,
+           (long long)N, ti, (M + N) / ti);
 
-  BIGINT kt = N/2;          // check arbitrary choice of one targ pt
-  Ftr=0, Fti=0;                 // crude direct...
-#pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(+:Ftr,Fti)
-  for (BIGINT j=0; j<M; ++j) {  // Ft += c[j] * exp(IMA*(FLT)isign*(s[kt]*x[j] + t[kt]*y[j] + u[kt]*z[j]))
-    FLT w=(FLT)isign*(s[kt]*x[j]+t[kt]*y[j]+u[kt]*z[j]), co=cos(w), si=sin(w);
-    Ftr += real(c[j])*co - imag(c[j])*si;  // cpx arith by hand
-    Fti += imag(c[j])*co + real(c[j])*si;
+  BIGINT kt = N / 2;               // check arbitrary choice of one targ pt
+  Ftr = 0, Fti = 0;                // crude direct...
+#pragma omp parallel for schedule(static, TEST_RANDCHUNK) reduction(+ : Ftr, Fti)
+  for (BIGINT j = 0; j < M; ++j) { // Ft += c[j] * exp(IMA*(FLT)isign*(s[kt]*x[j] +
+                                   // t[kt]*y[j] + u[kt]*z[j]))
+    FLT w = (FLT)isign * (s[kt] * x[j] + t[kt] * y[j] + u[kt] * z[j]), co = cos(w),
+        si = sin(w);
+    Ftr += real(c[j]) * co - imag(c[j]) * si; // cpx arith by hand
+    Fti += imag(c[j]) * co + real(c[j]) * si;
   }
-  err = abs(Ftr+IMA*Fti - F[kt])/infnorm(N,F);
-  errmax = max(err,errmax);
-  printf("\tone targ: rel err in F[%lld] is %.3g\n",(long long)kt,err);
-  if (((int64_t)M)*N<=TEST_BIGPROB) {                  // also full direct eval
-    CPX* Ft = (CPX*)malloc(sizeof(CPX)*N);
-    dirft3d3(M,x,y,z,c,isign,N,s,t,u,Ft);       // writes to F
-    err = relerrtwonorm(N,Ft,F);
-    errmax = max(err,errmax);
-    printf("\tdirft3d: rel l2-err of result F is %.3g\n",err);
-    //cout<<"s t u, F, Ft, F/Ft:\n"; for (int k=0;k<N;++k) cout<<s[k]<<" "<<t[k]<<" "<<u[k]<<", "<<F[k]<<",\t"<<Ft[k]<<",\t"<<F[k]/Ft[k]<<endl;
+  err    = abs(Ftr + IMA * Fti - F[kt]) / infnorm(N, F);
+  errmax = max(err, errmax);
+  printf("\tone targ: rel err in F[%lld] is %.3g\n", (long long)kt, err);
+  if (((int64_t)M) * N <= TEST_BIGPROB) {           // also full direct eval
+    CPX *Ft = (CPX *)malloc(sizeof(CPX) * N);
+    dirft3d3(M, x, y, z, c, isign, N, s, t, u, Ft); // writes to Ft
+    err    = relerrtwonorm(N, Ft, F);
+    errmax = max(err, errmax);
+    printf("\tdirft3d: rel l2-err of result F is %.3g\n", err);
+    // cout<<"s t u, F, Ft, F/Ft:\n"; for (int k=0;k<N;++k) cout<<s[k]<<" "<<t[k]<<" "
+    // <<u[k]<<", "<<F[k]<<",\t"<<Ft[k]<<",\t"<<F[k]/Ft[k]<<endl;
     free(Ft);
   }
 
-  free(x); free(y); free(z); free(c); free(F); free(s); free(t); free(u);
-  return (errmax>errfail);
+  free(x);
+  free(y);
+  free(z);
+  free(c);
+  free(F);
+  free(s);
+  free(t);
+  free(u);
+  return (errmax > errfail);
 }
diff --git a/test/finufft3dmany_test.cpp b/test/finufft3dmany_test.cpp
index d427555c3..48c1fe422 100644
--- a/test/finufft3dmany_test.cpp
+++ b/test/finufft3dmany_test.cpp
@@ -4,254 +4,283 @@
 using namespace std;
 using namespace finufft::utils;
 
-const char* help[]={
-  "Tester for FINUFFT in 3d, vectorized, all 3 types, either precision.",
-  "",
-  "Usage: finufft3dmany_test ntrans Nmodes1 Nmodes2 Nmodes3 Nsrc [tol [debug [spread_thread [maxbatchsize [spreadsort [upsampfac [errfail]]]]]]]",
-  "\teg:\tfinufft3dmany_test 100 50 50 50 1e5 1e-3 1 0 0 2 0.0 1e-2",
-  "\tnotes:\tif errfail present, exit code 1 if any error > errfail",
-  NULL};
+const char *help[] = {
+    "Tester for FINUFFT in 3d, vectorized, all 3 types, either precision.",
+    "",
+    "Usage: finufft3dmany_test ntrans Nmodes1 Nmodes2 Nmodes3 Nsrc [tol [debug "
+    "[spread_thread [maxbatchsize [spreadsort [upsampfac [errfail]]]]]]]",
+    "\teg:\tfinufft3dmany_test 100 50 50 50 1e5 1e-3 1 0 0 2 0.0 1e-2",
+    "\tnotes:\tif errfail present, exit code 1 if any error > errfail",
+    NULL};
 // Malleo 2019 based on Shih 2018. Tidied, extra args, Barnett 5/25/20.
 
-int main(int argc, char* argv[])
-{
-  BIGINT M, N1, N2, N3;          // M = # srcs, N1,N2 = # modes
-  int ntransf;                   // # of vectors for "many" interface
-  double w, tol = 1e-6;          // default
+int main(int argc, char *argv[]) {
+  BIGINT M, N1, N2, N3;       // M = # srcs, N1,N2,N3 = # modes
+  int ntransf;                // # of vectors for "many" interface
+  double w, tol       = 1e-6; // default
   double err, errfail = INFINITY, errmax = 0;
-  finufft_opts opts; FINUFFT_DEFAULT_OPTS(&opts);
+  finufft_opts opts;
+  FINUFFT_DEFAULT_OPTS(&opts);
   // opts.fftw = FFTW_MEASURE;  // change from usual FFTW_ESTIMATE
-  int isign = +1;             // choose which exponential sign to test
-  if (argc<6 || argc>13) {
-    for (int i=0; help[i]; ++i)
-      fprintf(stderr,"%s\n",help[i]);
+  int isign = +1; // choose which exponential sign to test
+  if (argc < 6 || argc > 13) {
+    for (int i = 0; help[i]; ++i) fprintf(stderr, "%s\n", help[i]);
     return 2;
   }
-  sscanf(argv[1],"%lf",&w); ntransf = (int)w;
-  sscanf(argv[2],"%lf",&w); N1 = (BIGINT)w;
-  sscanf(argv[3],"%lf",&w); N2 = (BIGINT)w;
-  sscanf(argv[4],"%lf",&w); N3 = (BIGINT)w;
-  sscanf(argv[5],"%lf",&w); M = (BIGINT)w;
-  if (argc>6) sscanf(argv[6],"%lf",&tol);
-  if (argc>7) sscanf(argv[7],"%d",&opts.debug);
-  opts.spread_debug = (opts.debug>1) ? 1 : 0;  // see output from spreader
-  if (argc>8) sscanf(argv[8],"%d",&opts.spread_thread);  
-  if (argc>9) sscanf(argv[9],"%d",&opts.maxbatchsize);  
-  if (argc>10) sscanf(argv[10],"%d",&opts.spread_sort);
-  if (argc>11) { sscanf(argv[11],"%lf",&w); opts.upsampfac = (FLT)w; }
-  if (argc>12) sscanf(argv[12],"%lf",&errfail);
+  sscanf(argv[1], "%lf", &w);
+  ntransf = (int)w;
+  sscanf(argv[2], "%lf", &w);
+  N1 = (BIGINT)w;
+  sscanf(argv[3], "%lf", &w);
+  N2 = (BIGINT)w;
+  sscanf(argv[4], "%lf", &w);
+  N3 = (BIGINT)w;
+  sscanf(argv[5], "%lf", &w);
+  M = (BIGINT)w;
+  if (argc > 6) sscanf(argv[6], "%lf", &tol);
+  if (argc > 7) sscanf(argv[7], "%d", &opts.debug);
+  opts.spread_debug = (opts.debug > 1) ? 1 : 0; // see output from spreader
+  if (argc > 8) sscanf(argv[8], "%d", &opts.spread_thread);
+  if (argc > 9) sscanf(argv[9], "%d", &opts.maxbatchsize);
+  if (argc > 10) sscanf(argv[10], "%d", &opts.spread_sort);
+  if (argc > 11) {
+    sscanf(argv[11], "%lf", &w);
+    opts.upsampfac = (FLT)w;
+  }
+  if (argc > 12) sscanf(argv[12], "%lf", &errfail);
 
   cout << scientific << setprecision(15);
-  BIGINT N = N1*N2*N3;
+  BIGINT N = N1 * N2 * N3;
 
-  FLT* x = (FLT*)malloc(sizeof(FLT)*M);  // NU pts x coords
-  FLT* y = (FLT*)malloc(sizeof(FLT)*M);  // NU pts y coords
-  FLT* z = (FLT*)malloc(sizeof(FLT)*M);  // NU pts z coords
-  CPX* c = (CPX*)malloc(sizeof(CPX)*M*ntransf);   // strengths 
-  CPX* F = (CPX*)malloc(sizeof(CPX)*N*ntransf);   // mode ampls
+  FLT *x = (FLT *)malloc(sizeof(FLT) * M);           // NU pts x coords
+  FLT *y = (FLT *)malloc(sizeof(FLT) * M);           // NU pts y coords
+  FLT *z = (FLT *)malloc(sizeof(FLT) * M);           // NU pts z coords
+  CPX *c = (CPX *)malloc(sizeof(CPX) * M * ntransf); // strengths
+  CPX *F = (CPX *)malloc(sizeof(CPX) * N * ntransf); // mode ampls
 
 #pragma omp parallel
   {
-    unsigned int se=MY_OMP_GET_THREAD_NUM();
-#pragma omp for schedule(static,TEST_RANDCHUNK)
-    for (BIGINT j=0; j<M; ++j) {
-      x[j] = M_PI*randm11r(&se);
-      y[j] = M_PI*randm11r(&se);
-      z[j] = M_PI*randm11r(&se);
+    unsigned int se = MY_OMP_GET_THREAD_NUM();
+#pragma omp for schedule(static, TEST_RANDCHUNK)
+    for (BIGINT j = 0; j < M; ++j) {
+      x[j] = M_PI * randm11r(&se);
+      y[j] = M_PI * randm11r(&se);
+      z[j] = M_PI * randm11r(&se);
     }
-#pragma omp for schedule(static,TEST_RANDCHUNK)
-    for (BIGINT j = 0; j<ntransf*M; ++j)
-    {
-        c[j] = crandm11r(&se);
+#pragma omp for schedule(static, TEST_RANDCHUNK)
+    for (BIGINT j = 0; j < ntransf * M; ++j) {
+      c[j] = crandm11r(&se);
     }
   }
 
-
   printf("test 3d1 many vs repeated single: ------------------------------------\n");
-  CNTime timer; timer.start();
-  int ier = FINUFFT3D1MANY(ntransf,M,x,y,z,c,isign,tol,N1,N2,N3,F,&opts);
-  double ti=timer.elapsedsec();
-  if (ier>1) {
-    printf("error (ier=%d)!\n",ier);
+  CNTime timer;
+  timer.start();
+  int ier   = FINUFFT3D1MANY(ntransf, M, x, y, z, c, isign, tol, N1, N2, N3, F, &opts);
+  double ti = timer.elapsedsec();
+  if (ier > 1) {
+    printf("error (ier=%d)!\n", ier);
     return ier;
   } else
-    printf("ntr=%d: %lld NU pts to (%lld,%lld,%lld) modes in %.3g s \t%.3g NU pts/s\n", ntransf,(long long)M,(long long)N1,(long long)N2, (long long)N3, ti,ntransf*M/ti);
+    printf("ntr=%d: %lld NU pts to (%lld,%lld,%lld) modes in %.3g s \t%.3g NU pts/s\n",
+           ntransf, (long long)M, (long long)N1, (long long)N2, (long long)N3, ti,
+           ntransf * M / ti);
 
-  int i = ntransf-1;    // choose a data to check
-  BIGINT nt1 = (BIGINT)(0.37*N1), nt2 = (BIGINT)(0.26*N2), nt3 = (BIGINT)(-0.39*N3);  // choose some mode index to check
-  CPX Ft = CPX(0,0), J = IMA*(FLT)isign;
-  for (BIGINT j=0; j<M; ++j)
-    Ft += c[j+i*M] * exp(J*(nt1*x[j]+nt2*y[j]+nt3*z[j]));   // crude direct
-  BIGINT it = N1/2+nt1 + N1*(N2/2+nt2) + N1*N2*(N3/2+nt3);   // index in complex F as 1d array
-  err = abs(Ft-F[it+i*N])/infnorm(N,F+i*N);
-  errmax = max(err,errmax);
-    printf("\tone mode: rel err in F[%lld,%lld,%lld] of trans#%d is %.3g\n",
-	 (long long)nt1,(long long)nt2,(long long)nt3,i,err);
+  int i      = ntransf - 1;          // choose a transform to check
+  BIGINT nt1 = (BIGINT)(0.37 * N1), nt2 = (BIGINT)(0.26 * N2),
+         nt3 = (BIGINT)(-0.39 * N3); // choose some mode index to check
+  CPX Ft = CPX(0, 0), J = IMA * (FLT)isign;
+  for (BIGINT j = 0; j < M; ++j)
+    Ft += c[j + i * M] * exp(J * (nt1 * x[j] + nt2 * y[j] + nt3 * z[j])); // crude direct
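+  // index of the checked mode in complex F viewed as a 1d array (dimension 1 varies
+  // fastest); adding N/2 along each dimension shifts the signed mode index so that
+  // the lowest mode lands at offset 0
+  BIGINT it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2) + N1 * N2 * (N3 / 2 + nt3);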
+  err    = abs(Ft - F[it + i * N]) / infnorm(N, F + i * N);
+  errmax = max(err, errmax);
+  printf("\tone mode: rel err in F[%lld,%lld,%lld] of trans#%d is %.3g\n", (long long)nt1,
+         (long long)nt2, (long long)nt3, i, err);
 
   // compare the result with FINUFFT3D1
   FFTW_FORGET_WISDOM();
-  finufft_opts simpleopts=opts;
-  simpleopts.debug = 0;       // don't output timing for calls of FINUFFT3D1
+  finufft_opts simpleopts = opts;
+  simpleopts.debug        = 0; // don't output timing for calls of FINUFFT3D1
   simpleopts.spread_debug = 0;
 
-  CPX* cstart;
-  CPX* Fstart;
-  CPX* F_3d1 = (CPX*)malloc(sizeof(CPX)*N*ntransf);
+  CPX *cstart;
+  CPX *Fstart;
+  CPX *F_3d1 = (CPX *)malloc(sizeof(CPX) * N * ntransf);
   timer.restart();
-  for (int k= 0; k<ntransf; ++k)
-  {
-    cstart = c+k*M;
-    Fstart = F_3d1+k*N;
-    ier = FINUFFT3D1(M,x,y,z,cstart,isign,tol,N1,N2,N3,Fstart,&simpleopts);
+  for (int k = 0; k < ntransf; ++k) {
+    cstart = c + k * M;
+    Fstart = F_3d1 + k * N;
+    ier    = FINUFFT3D1(M, x, y, z, cstart, isign, tol, N1, N2, N3, Fstart, &simpleopts);
   }
-  double t=timer.elapsedsec();
-  if (ier>1) {
-    printf("error (ier=%d)!\n",ier);
+  double t = timer.elapsedsec();
+  if (ier > 1) {
+    printf("error (ier=%d)!\n", ier);
     return ier;
   } else
-    printf("%d of: %lld NU pts to (%lld,%lld,%lld) modes in %.3g s  \t%.3g NU pts/s\n", ntransf,(long long)M,(long long)N1,(long long)N2,(long long)N3,t,ntransf*M/t);
-  printf("\t\t\tspeedup \t T_FINUFFT3D1 / T_finufft3d1many = %.3g\n", t/ti);
+    printf("%d of: %lld NU pts to (%lld,%lld,%lld) modes in %.3g s  \t%.3g NU pts/s\n",
+           ntransf, (long long)M, (long long)N1, (long long)N2, (long long)N3, t,
+           ntransf * M / t);
+  printf("\t\t\tspeedup \t T_FINUFFT3D1 / T_finufft3d1many = %.3g\n", t / ti);
 
   // Check accuracy (worst over the ntransf)
   double maxerror = 0.0;
   for (int k = 0; k < ntransf; ++k)
-    maxerror = max(maxerror, (double)relerrtwonorm(N,F_3d1+k*N,F+k*N));
-  errmax = max(maxerror,errmax);
-  printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2 ) =  %.3g\n",maxerror);
+    maxerror = max(maxerror, (double)relerrtwonorm(N, F_3d1 + k * N, F + k * N));
+  errmax = max(maxerror, errmax);
+  printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2 ) =  %.3g\n", maxerror);
   free(F_3d1);
 
-
   printf("test 3d2 many vs repeated single: ------------------------------------\n");
 #pragma omp parallel
   {
-    unsigned int se=MY_OMP_GET_THREAD_NUM();
-#pragma omp for schedule(static,TEST_RANDCHUNK)
-    for (BIGINT m=0; m<N*ntransf; ++m) F[m] = crandm11r(&se);
+    unsigned int se = MY_OMP_GET_THREAD_NUM();
+#pragma omp for schedule(static, TEST_RANDCHUNK)
+    for (BIGINT m = 0; m < N * ntransf; ++m) F[m] = crandm11r(&se);
   }
   FFTW_FORGET_WISDOM();
   timer.restart();
-  ier = FINUFFT3D2MANY(ntransf,M,x,y,z,c,isign,tol,N1,N2,N3,F,&opts);
-  ti=timer.elapsedsec();
-  if (ier>1) {
-    printf("error (ier=%d)!\n",ier);
+  ier = FINUFFT3D2MANY(ntransf, M, x, y, z, c, isign, tol, N1, N2, N3, F, &opts);
+  ti  = timer.elapsedsec();
+  if (ier > 1) {
+    printf("error (ier=%d)!\n", ier);
     return ier;
   } else
-    printf("ntr=%d: (%lld,%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", ntransf,(long long)N1,(long long)N2, (long long)N3, (long long)M,ti,ntransf*M/ti);
-  
-  i = ntransf-1;      // choose a data to check
-  BIGINT jt = M/2;    // check arbitrary choice of one targ pt
-  CPX ct = CPX(0,0);
-  BIGINT m=0;
-  for(BIGINT m3=-(N3/2); m3<=(N3-1)/2; ++m3){
-    for (BIGINT m2=-(N2/2); m2<=(N2-1)/2; ++m2){  // loop in correct order over F
-      for (BIGINT m1=-(N1/2); m1<=(N1-1)/2; ++m1){
-	ct += F[i*N + m++] * exp(J*(m1*x[jt]+m2*y[jt]+m3*z[jt]));   // crude direct
+    printf("ntr=%d: (%lld,%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n",
+           ntransf, (long long)N1, (long long)N2, (long long)N3, (long long)M, ti,
+           ntransf * M / ti);
+
+  i         = ntransf - 1; // choose a data to check
+  BIGINT jt = M / 2;       // check arbitrary choice of one targ pt
+  CPX ct    = CPX(0, 0);
+  BIGINT m  = 0;
+  for (BIGINT m3 = -(N3 / 2); m3 <= (N3 - 1) / 2; ++m3) {
+    for (BIGINT m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2) { // loop in correct order over
+                                                            // F
+      for (BIGINT m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1) {
+        ct += F[i * N + m++] * exp(J * (m1 * x[jt] + m2 * y[jt] + m3 * z[jt])); // crude
+                                                                                // direct
       }
     }
   }
-  err = abs(ct-c[jt+i*M])/infnorm(M,c+i*M);
-  errmax = max(err,errmax);
-  printf("\tone targ: rel err in c[%lld] of trans#%d is %.3g\n",(long long)jt,i,err);
+  err    = abs(ct - c[jt + i * M]) / infnorm(M, c + i * M);
+  errmax = max(err, errmax);
+  printf("\tone targ: rel err in c[%lld] of trans#%d is %.3g\n", (long long)jt, i, err);
 
   FFTW_FORGET_WISDOM();
   // compare the result with FINUFFT3D2...
-  CPX* c_3d2 = (CPX*)malloc(sizeof(CPX)*M*ntransf);
+  CPX *c_3d2 = (CPX *)malloc(sizeof(CPX) * M * ntransf);
   timer.restart();
-  for (int k=0; k<ntransf; ++k)
-  {
-    cstart = c_3d2+k*M;
-    Fstart = F+k*N;
-    ier = FINUFFT3D2(M,x,y,z,cstart,isign,tol,N1,N2,N3,Fstart,&simpleopts);
+  for (int k = 0; k < ntransf; ++k) {
+    cstart = c_3d2 + k * M;
+    Fstart = F + k * N;
+    ier    = FINUFFT3D2(M, x, y, z, cstart, isign, tol, N1, N2, N3, Fstart, &simpleopts);
   }
   t = timer.elapsedsec();
-  if (ier>1) {
-    printf("error (ier=%d)!\n",ier);
+  if (ier > 1) {
+    printf("error (ier=%d)!\n", ier);
     return ier;
   } else
-    printf("%d of: (%lld,%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", ntransf,(long long)N1,(long long)N2,(long long)N3,(long long)M,t,ntransf*M/t);
-  printf("\t\t\tspeedup \t T_FINUFFT3D2 / T_finufft3d2many = %.3g\n", t/ti);
+    printf("%d of: (%lld,%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n",
+           ntransf, (long long)N1, (long long)N2, (long long)N3, (long long)M, t,
+           ntransf * M / t);
+  printf("\t\t\tspeedup \t T_FINUFFT3D2 / T_finufft3d2many = %.3g\n", t / ti);
 
-  maxerror = 0.0;           // worst error over the ntransf
+  maxerror = 0.0; // worst error over the ntransf
   for (int k = 0; k < ntransf; ++k)
-    maxerror = max(maxerror, (double)relerrtwonorm(M,c_3d2+k*M,c+k*M));
-  errmax = max(maxerror,errmax);
-  printf("\tconsistency check: sup ( ||c_many-c||_2 / ||c||_2 ) =  %.3g\n",maxerror);
+    maxerror = max(maxerror, (double)relerrtwonorm(M, c_3d2 + k * M, c + k * M));
+  errmax = max(maxerror, errmax);
+  printf("\tconsistency check: sup ( ||c_many-c||_2 / ||c||_2 ) =  %.3g\n", maxerror);
   free(c_3d2);
 
-
   printf("test 3d3 many vs repeated single: ------------------------------------\n");
   FFTW_FORGET_WISDOM();
   // reuse the strengths c, interpret N as number of targs:
 #pragma omp parallel
   {
-    unsigned int se=MY_OMP_GET_THREAD_NUM();
-#pragma omp for schedule(static,TEST_RANDCHUNK)
-    for (BIGINT j=0; j<M; ++j) {
-      x[j] = 2.0 + M_PI*randm11r(&se);      // new x_j srcs, offset from origin
-      y[j] = -3.0 + M_PI*randm11r(&se);     // " y_j
-      z[j] = 1.0 + M_PI*randm11r(&se);      // " z_j
+    unsigned int se = MY_OMP_GET_THREAD_NUM();
+#pragma omp for schedule(static, TEST_RANDCHUNK)
+    for (BIGINT j = 0; j < M; ++j) {
+      x[j] = 2.0 + M_PI * randm11r(&se);  // new x_j srcs, offset from origin
+      y[j] = -3.0 + M_PI * randm11r(&se); // " y_j
+      z[j] = 1.0 + M_PI * randm11r(&se);  // " z_j
     }
-  }  
-  FLT* s_freq = (FLT*)malloc(sizeof(FLT)*N);    // targ freqs (1-cmpt)
-  FLT* t_freq = (FLT*)malloc(sizeof(FLT)*N);    // targ freqs (2-cmpt)
-  FLT* u_freq = (FLT*)malloc(sizeof(FLT)*N);    // targ freqs (3-cmpt)
-  FLT S1 = (FLT)N1/2;                   // choose freq range sim to type 1
-  FLT S2 = (FLT)N2/2;
-  FLT S3 = (FLT)N3/2;
+  }
+  FLT *s_freq = (FLT *)malloc(sizeof(FLT) * N); // targ freqs (1-cmpt)
+  FLT *t_freq = (FLT *)malloc(sizeof(FLT) * N); // targ freqs (2-cmpt)
+  FLT *u_freq = (FLT *)malloc(sizeof(FLT) * N); // targ freqs (3-cmpt)
+  FLT S1      = (FLT)N1 / 2;                    // choose freq range sim to type 1
+  FLT S2      = (FLT)N2 / 2;
+  FLT S3      = (FLT)N3 / 2;
 
 #pragma omp parallel
   {
-    unsigned int se=MY_OMP_GET_THREAD_NUM();
-#pragma omp for schedule(static,TEST_RANDCHUNK)
-    for (BIGINT k=0; k<N; ++k) {
-      s_freq[k] = S1*(1.7 + randm11r(&se));    //S*(1.7 + k/(FLT)N); // offset the freqs
-      t_freq[k] = S2*(-0.5 + randm11r(&se));
-      u_freq[k] = S3*(0.9 + randm11r(&se));
+    unsigned int se = MY_OMP_GET_THREAD_NUM();
+#pragma omp for schedule(static, TEST_RANDCHUNK)
+    for (BIGINT k = 0; k < N; ++k) {
+      s_freq[k] = S1 * (1.7 + randm11r(&se)); // S*(1.7 + k/(FLT)N); // offset the freqs
+      t_freq[k] = S2 * (-0.5 + randm11r(&se));
+      u_freq[k] = S3 * (0.9 + randm11r(&se));
     }
   }
 
   timer.restart();
-  ier = FINUFFT3D3MANY(ntransf,M,x,y,z,c,isign,tol,N,s_freq,t_freq,u_freq,F,&opts);
-  ti=timer.elapsedsec();
-  if (ier>1) {
-    printf("error (ier=%d)!\n",ier);
+  ier = FINUFFT3D3MANY(ntransf, M, x, y, z, c, isign, tol, N, s_freq, t_freq, u_freq, F,
+                       &opts);
+  ti  = timer.elapsedsec();
+  if (ier > 1) {
+    printf("error (ier=%d)!\n", ier);
     return ier;
   } else
-    printf("ntr=%d: %lld NU to %lld NU in %.3g s \t%.3g tot NU pts/s\n",ntransf, (long long)M,(long long)N,ti,ntransf*(M+N)/ti);
+    printf("ntr=%d: %lld NU to %lld NU in %.3g s \t%.3g tot NU pts/s\n", ntransf,
+           (long long)M, (long long)N, ti, ntransf * (M + N) / ti);
 
-  i = ntransf-1;           // choose a transform to check
-  BIGINT kt = N/4;          // check arbitrary choice of one targ pt
-  Ft = CPX(0,0);
-  for (BIGINT j=0;j<M;++j)
-    Ft += c[i*M + j] * exp(J*(s_freq[kt]*x[j] + t_freq[kt]*y[j]+ u_freq[kt]*z[j]));
-  err = abs(Ft-F[kt+i*N])/infnorm(N,F+i*N);
-  errmax = max(err,errmax);
-  printf("\t one targ: rel err in F[%lld] of trans#%d is %.3g\n",(long long)kt,i,err);
+  i         = ntransf - 1; // choose a transform to check
+  BIGINT kt = N / 4;       // check arbitrary choice of one targ pt
+  Ft        = CPX(0, 0);
+  for (BIGINT j = 0; j < M; ++j)
+    Ft += c[i * M + j] *
+          exp(J * (s_freq[kt] * x[j] + t_freq[kt] * y[j] + u_freq[kt] * z[j]));
+  err    = abs(Ft - F[kt + i * N]) / infnorm(N, F + i * N);
+  errmax = max(err, errmax);
+  printf("\t one targ: rel err in F[%lld] of trans#%d is %.3g\n", (long long)kt, i, err);
 
   FFTW_FORGET_WISDOM();
-// compare the result with FINUFFT3D3...
-  CPX* f_3d3 = (CPX*)malloc(sizeof(CPX)*N*ntransf);
+  // compare the result with FINUFFT3D3...
+  CPX *f_3d3 = (CPX *)malloc(sizeof(CPX) * N * ntransf);
   timer.restart();
-  for (int k=0; k<ntransf; ++k) {
-    Fstart = f_3d3+k*N;
-    cstart = c+k*M;
-    ier = FINUFFT3D3(M,x,y,z,cstart,isign,tol,N, s_freq,t_freq,u_freq,Fstart,&simpleopts);
+  for (int k = 0; k < ntransf; ++k) {
+    Fstart = f_3d3 + k * N;
+    cstart = c + k * M;
+    ier    = FINUFFT3D3(M, x, y, z, cstart, isign, tol, N, s_freq, t_freq, u_freq, Fstart,
+                        &simpleopts);
   }
   t = timer.elapsedsec();
-  if (ier>1) {
-    printf("error (ier=%d)!\n",ier);
+  if (ier > 1) {
+    printf("error (ier=%d)!\n", ier);
     return ier;
   } else
-    printf("%d of: %lld NU to %lld NU in %.3g s   \t%.3g tot NU pts/s\n",ntransf, (long long)M,(long long)N,t,ntransf*(M+N)/t);
-  printf("\t\t\tspeedup \t T_FINUFFT3D3 / T_finufft3d3many = %.3g\n", t/ti);
-  
-  maxerror = 0.0;           // worst error over the ntransf
+    printf("%d of: %lld NU to %lld NU in %.3g s   \t%.3g tot NU pts/s\n", ntransf,
+           (long long)M, (long long)N, t, ntransf * (M + N) / t);
+  printf("\t\t\tspeedup \t T_FINUFFT3D3 / T_finufft3d3many = %.3g\n", t / ti);
+
+  maxerror = 0.0; // worst error over the ntransf
   for (int k = 0; k < ntransf; ++k)
-    maxerror = max(maxerror, (double)relerrtwonorm(N,f_3d3+k*N,F+k*N));
-  errmax = max(maxerror,errmax);
-  printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2 ) =  %.3g\n",maxerror);
+    maxerror = max(maxerror, (double)relerrtwonorm(N, f_3d3 + k * N, F + k * N));
+  errmax = max(maxerror, errmax);
+  printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2 ) =  %.3g\n", maxerror);
   free(f_3d3);
-  
-  free(x); free(y); free(z); free(c); free(F); free(s_freq); free(t_freq); free(u_freq);
-  return (errmax>errfail);
+
+  free(x);
+  free(y);
+  free(z);
+  free(c);
+  free(F);
+  free(s_freq);
+  free(t_freq);
+  free(u_freq);
+  return (errmax > errfail);
 }
diff --git a/test/testutils.cpp b/test/testutils.cpp
index cd2cd7bef..64b5d7a0a 100644
--- a/test/testutils.cpp
+++ b/test/testutils.cpp
@@ -9,16 +9,16 @@
    and platform-indep, than having to compare the text output)
 
    Suggested compile (double/float versions):
-   g++ -std=c++14 -fopenmp testutils.cpp -I../include ../src/utils.o ../src/utils_precindep.o -o testutils -lgomp
-   g++ -std=c++14 -fopenmp testutils.cpp -I../include ../src/utils_32.o ../src/utils_precindep.o -o testutilsf -lgomp -DSINGLE
+   g++ -std=c++14 -fopenmp testutils.cpp -I../include ../src/utils.o ../src/utils_precindep.o -o testutils -lgomp
+
+   g++ -std=c++14 -fopenmp testutils.cpp -I../include ../src/utils_32.o ../src/utils_precindep.o -o testutilsf -lgomp -DSINGLE
 */
 
 // This switches FLT macro from double to float if SINGLE is defined, etc...
 #include <finufft/test_defs.h>
 using namespace finufft::utils;
 
-int main(int argc, char* argv[])
-{
+int main(int argc, char *argv[]) {
 #ifdef SINGLE
   printf("testutilsf started...\n");
 #else
@@ -28,35 +28,41 @@ int main(int argc, char* argv[])
   // test next235even...
   // Barnett 2/9/17, made smaller range 3/28/17. pass-fail 6/16/23
   // The true outputs from {0,1,..,99}:
-  const BIGINT next235even_true[100] = {2, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 16, 16, 16, 16, 18, 18, 20, 20, 24, 24, 24, 24, 30, 30, 30, 30, 30, 30, 32, 32, 36, 36, 36, 36, 40, 40, 40, 40, 48, 48, 48, 48, 48, 48, 48, 48, 50, 50, 54, 54, 54, 54, 60, 60, 60, 60, 60, 60, 64, 64, 64, 64, 72, 72, 72, 72, 72, 72, 72, 72, 80, 80, 80, 80, 80, 80, 80, 80, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 96, 96, 96, 96, 96, 96, 100, 100, 100};
-  for (BIGINT n=0;n<100;++n) {
+  const BIGINT next235even_true[100] = {
+      2,  2,  2,  4,  4,  6,  6,  8,  8,  10, 10, 12, 12, 16, 16, 16, 16, 18,  18,  20,
+      20, 24, 24, 24, 24, 30, 30, 30, 30, 30, 30, 32, 32, 36, 36, 36, 36, 40,  40,  40,
+      40, 48, 48, 48, 48, 48, 48, 48, 48, 50, 50, 54, 54, 54, 54, 60, 60, 60,  60,  60,
+      60, 64, 64, 64, 64, 72, 72, 72, 72, 72, 72, 72, 72, 80, 80, 80, 80, 80,  80,  80,
+      80, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 96, 96, 96, 96, 96, 96, 100, 100, 100};
+  for (BIGINT n = 0; n < 100; ++n) {
     BIGINT o = next235even(n);
     BIGINT t = next235even_true[n];
-    if (o!=t) {
-      printf("next235even(%lld) =\t%lld, error should be %lld!\n",(long long)n, (long long)o, (long long)t);
+    if (o != t) {
+      printf("next235even(%lld) =\t%lld, error should be %lld!\n", (long long)n,
+             (long long)o, (long long)t);
       return 1;
     }
   }
-  
+
   // various old devel expts and comments...
-  //printf("starting huge next235even...\n");   // 1e11 takes 1 sec
-  //BIGINT n=(BIGINT)120573851963;
-  //printf("next235even(%ld) =\t%ld\n",n,next235even(n));
-  //double* a; printf("%g\n",a[0]);  // do deliberate segfault for bash debug!
+  // printf("starting huge next235even...\n");   // 1e11 takes 1 sec
+  // BIGINT n=(BIGINT)120573851963;
+  // printf("next235even(%ld) =\t%ld\n",n,next235even(n));
+  // double* a; printf("%g\n",a[0]);  // do deliberate segfault for bash debug!
 
   // test vector norms and norm difference routines... now pass-fail 6/16/23
   BIGINT M = 1e4;
   std::vector<CPX> a(M), b(M);
-  for (BIGINT j=0; j<M; ++j) {
-    a[j] = CPX(1.0,0.0);
+  for (BIGINT j = 0; j < M; ++j) {
+    a[j] = CPX(1.0, 0.0);
     b[j] = a[j];
   }
-  FLT relerr=2.0*EPSILON;    // 1 ULP, fine since 1.0 rep exactly
-  if (abs(infnorm(M,&a[0]) - 1.0) > relerr) return 1;
-  if (abs(twonorm(M,&a[0]) - sqrt((FLT)M)) > relerr*sqrt((FLT)M)) return 1;
-  b[0] = CPX(0.0,0.0);  // perturb b from a
-  if (abs(errtwonorm(M,&a[0],&b[0]) - 1.0) > relerr) return 1;
-  if (abs(sqrt((FLT)M)* relerrtwonorm(M,&a[0],&b[0]) - 1.0) > relerr) return 1;
+  FLT relerr = 2.0 * EPSILON; // 1 ULP, fine since 1.0 rep exactly
+  if (abs(infnorm(M, &a[0]) - 1.0) > relerr) return 1;
+  if (abs(twonorm(M, &a[0]) - sqrt((FLT)M)) > relerr * sqrt((FLT)M)) return 1;
+  b[0] = CPX(0.0, 0.0); // perturb b from a
+  if (abs(errtwonorm(M, &a[0], &b[0]) - 1.0) > relerr) return 1;
+  if (abs(sqrt((FLT)M) * relerrtwonorm(M, &a[0], &b[0]) - 1.0) > relerr) return 1;
 
 #ifdef SINGLE
   printf("testutilsf passed.\n");