diff --git a/.clang-format b/.clang-format index 7f798e5c0..0488353ca 100644 --- a/.clang-format +++ b/.clang-format @@ -1,8 +1,33 @@ --- -Language: Cpp -BasedOnStyle: LLVM -TabWidth: 4 -ColumnLimit: 120 -IndentWidth: 4 -AlwaysBreakTemplateDeclarations: true +BasedOnStyle: LLVM +AlignAfterOpenBracket: Align +AlignConsecutiveMacros: AcrossEmptyLinesAndComments +AlignConsecutiveAssignments: Consecutive +AlignEscapedNewlines: Left +AlignOperands: true +AlignTrailingComments: + Kind: Always + OverEmptyLines: 1 +AllowShortIfStatementsOnASingleLine: WithoutElse +AllowShortLambdasOnASingleLine: Inline +AllowShortLoopsOnASingleLine: true +BreakBeforeBraces: Attach +BreakBeforeBinaryOperators: None +ColumnLimit: 90 +ExperimentalAutoDetectBinPacking: true +FixNamespaceComments: true +IndentWidth: 2 +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ReflowComments: true +PenaltyBreakComment: 1 +PenaltyBreakOpenParenthesis: 1 # modified; was 0 +SortIncludes: CaseSensitive +SortUsingDeclarations: true +SpacesBeforeTrailingComments: 1 +SpaceAfterCStyleCast: false +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: false +TabWidth: 2 +UseTab: Never ... diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 000000000..1e469ec95 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,2 @@ +# Applied clang format to the codebase +884ba427be0c60aa3399d5ea71b0e9e3a7cbf686 \ No newline at end of file diff --git a/CHANGELOG b/CHANGELOG index 863c17377..d89a2f1ac 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -18,6 +18,10 @@ If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately). * MAX_NF increased from 1e11 to 1e12, since machines grow. * improved GPU python docs: migration guide; usage from cupy, numba, torch, pycuda. PyPI pkg still at 2.2.0beta. +* Added a clang-format pre-commit hook to ensure consistent code style. + Created a .clang-format file to define the style similar to the existing style. + Applied clang-format to all cmake, C, C++, and CUDA code. Ignored the blame + using .git-blame-ignore-revs. Added a contributing.md for developers. V 2.2.0 (12/12/23) diff --git a/CMakeLists.txt b/CMakeLists.txt index 640fc18f3..347d2f3f8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,20 +3,20 @@ cmake_minimum_required(VERSION 3.19) project(finufft VERSION 2.2.0 LANGUAGES C CXX) set(GNU_LIKE_FRONTENDS AppleClang Clang GNU) -if(CMAKE_CXX_COMPILER_ID IN_LIST GNU_LIKE_FRONTENDS) +if (CMAKE_CXX_COMPILER_ID IN_LIST GNU_LIKE_FRONTENDS) # Set custom compiler flags for gcc-compatible compilers set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG -funroll-loops") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG -funroll-loops") -endif() +endif () include(CTest) -if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc|ppc64|powerpc|powerpc64" OR (APPLE AND CMAKE_OSX_ARCHITECTURES MATCHES "ppc|ppc64")) +if (CMAKE_SYSTEM_PROCESSOR MATCHES "ppc|ppc64|powerpc|powerpc64" OR (APPLE AND CMAKE_OSX_ARCHITECTURES MATCHES "ppc|ppc64")) # PowerPC arch does not have -march flag. set(FINUFFT_ARCH_FLAGS "-mtune=native" CACHE STRING "Compiler flags for specifying target architecture.") -else() +else () set(FINUFFT_ARCH_FLAGS "-march=native" CACHE STRING "Compiler flags for specifying target architecture.") -endif() +endif () set(FINUFFT_FFTW_SUFFIX "OpenMP" CACHE STRING "Suffix for FFTW libraries (e.g. 
OpenMP, Threads etc.)") set(FINUFFT_FFTW_LIBRARIES "DEFAULT" CACHE STRING "Specify a custom FFTW library") @@ -34,16 +34,16 @@ option(FINUFFT_STATIC_LINKING "Whether to link the static FINUFFT library (libfi option(FINUFFT_BUILD_DEVEL "Whether to build developement executables" OFF) # sphinx tag (don't remove): @cmake_opts_end -if(FINUFFT_USE_CPU) +if (FINUFFT_USE_CPU) # suppress Windows warnings about "unsafe" functions - if(WIN32) + if (WIN32) add_definitions(-D_CRT_SECURE_NO_WARNINGS) - endif() + endif () # make apple with gnu use old linker, new linker breaks, see issue #360 - if((APPLE) AND (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")) + if ((APPLE) AND (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")) add_link_options("-ld64") - endif() + endif () set(CPM_DOWNLOAD_VERSION 0.38.0) set(FFTW_VERSION 3.3.10) @@ -51,7 +51,7 @@ if(FINUFFT_USE_CPU) include(cmake/setupCPM.cmake) include(cmake/setupFFTW.cmake) -endif() +endif () if (FINUFFT_BUILD_MATLAB) # When building for matlab, we will fetch the OpenMP library used by matlab @@ -99,21 +99,21 @@ endfunction() # Utility function to link static/dynamic lib function(finufft_link_test target) - if(FINUFFT_STATIC_LINKING) - target_link_libraries(${target} PRIVATE finufft_static) - if(FINUFFT_USE_OPENMP) - target_link_libraries(${target} PRIVATE OpenMP::OpenMP_CXX) - if(WIN32) - target_link_options(${target} PRIVATE ${OpenMP_CXX_FLAGS}) - endif() - endif() - else() - target_link_libraries(${target} PRIVATE finufft) - if(WIN32) - target_compile_definitions(${target} PRIVATE FINUFFT_DLL) - endif() - endif() - enable_asan(${target}) + if (FINUFFT_STATIC_LINKING) + target_link_libraries(${target} PRIVATE finufft_static) + if (FINUFFT_USE_OPENMP) + target_link_libraries(${target} PRIVATE OpenMP::OpenMP_CXX) + if (WIN32) + target_link_options(${target} PRIVATE ${OpenMP_CXX_FLAGS}) + endif () + endif () + else () + target_link_libraries(${target} PRIVATE finufft) + if (WIN32) + target_compile_definitions(${target} PRIVATE FINUFFT_DLL) + endif () + endif () + enable_asan(${target}) endfunction() # Utility function to set finufft compilation options. @@ -134,9 +134,9 @@ function(set_finufft_options target) target_link_libraries(${target} PRIVATE OpenMP::OpenMP_CXX) # there are issues on windows with OpenMP and CMake, so we need to manually add the flags # otherwise there are link errors - if(WIN32) + if (WIN32) target_link_options(${target} PRIVATE ${OpenMP_CXX_FLAGS}) - endif() + endif () else () if (CMAKE_CXX_COMPILER_ID IN_LIST FINUFFT_GNU_LIKE_COMPILERS) # OpenMP disabled, suppress unknown pragma warnings to avoid spam. @@ -148,16 +148,16 @@ function(set_finufft_options target) # include them since we need them for build not for install # trying to include them directly into the fftw and fftwf targets causes issues with # the latest version of cmake, so we do it here instead. 
- if ( (NOT FFTW_FOUND ) OR (FINUFFT_FFTW_LIBRARIES STREQUAL DOWNLOAD)) - list (GET FINUFFT_FFTW_LIBRARIES 0 element) + if ((NOT FFTW_FOUND) OR (FINUFFT_FFTW_LIBRARIES STREQUAL DOWNLOAD)) + list(GET FINUFFT_FFTW_LIBRARIES 0 element) get_property(FFTW_SOURCE_DIR TARGET ${element} PROPERTY SOURCE_DIR) set(FFTW_INCLUDE_DIR ${FFTW_SOURCE_DIR}/api) target_include_directories(${target} PUBLIC ${FFTW_INCLUDE_DIR}) - endif() + endif () endfunction() -if(FINUFFT_USE_CPU) +if (FINUFFT_USE_CPU) # Main finufft libraries add_library(finufft_f32 OBJECT ${FINUFFT_PRECISION_DEPENDENT_SOURCES}) target_compile_definitions(finufft_f32 PRIVATE SINGLE) @@ -169,7 +169,7 @@ if(FINUFFT_USE_CPU) set_finufft_options(finufft_f64) target_link_libraries(finufft_f64 PUBLIC ${FINUFFT_FFTW_LIBRARIES}) - if(WIN32) + if (WIN32) add_library(finufft_f32_dll OBJECT ${FINUFFT_PRECISION_DEPENDENT_SOURCES}) target_compile_definitions(finufft_f32_dll PRIVATE SINGLE dll_EXPORTS FINUFFT_DLL) set_finufft_options(finufft_f32_dll) @@ -179,20 +179,20 @@ if(FINUFFT_USE_CPU) target_compile_definitions(finufft_f64_dll PRIVATE dll_EXPORTS FINUFFT_DLL) set_finufft_options(finufft_f64_dll) target_link_libraries(finufft_f64_dll PUBLIC ${FINUFFT_FFTW_LIBRARIES}) - endif() + endif () add_library(finufft SHARED src/utils_precindep.cpp contrib/legendre_rule_fast.cpp) target_compile_definitions(finufft PRIVATE dll_EXPORTS FINUFFT_DLL) set_finufft_options(finufft) - if(NOT WIN32) + if (NOT WIN32) target_link_libraries(finufft PUBLIC finufft_f32 finufft_f64) - else() + else () target_link_libraries(finufft PUBLIC finufft_f32_dll finufft_f64_dll) - endif() + endif () # windows does not have a math library, so we need to exclude it - if(NOT WIN32) + if (NOT WIN32) target_link_libraries(finufft PUBLIC m) - endif() + endif () target_include_directories(finufft PUBLIC $) target_include_directories(finufft SYSTEM INTERFACE $) @@ -200,9 +200,9 @@ if(FINUFFT_USE_CPU) set_finufft_options(finufft_static) target_link_libraries(finufft_static PUBLIC finufft_f32 finufft_f64) # windows does not have a math library, so we need to exclude it - if(NOT WIN32) + if (NOT WIN32) target_link_libraries(finufft_static PUBLIC m) - endif() + endif () target_include_directories(finufft_static PUBLIC $) target_include_directories(finufft_static SYSTEM INTERFACE $) @@ -210,23 +210,23 @@ if(FINUFFT_USE_CPU) set_target_properties(finufft PROPERTIES PUBLIC_HEADER "${FINUFFT_PUBLIC_HEADERS}") list(APPEND INSTALL_TARGETS finufft finufft_static) -endif() - -if(FINUFFT_USE_CUDA) - if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - message("FINUFFT WARNING: No CUDA architecture supplied via '-DCMAKE_CUDA_ARCHITECTURES=...', defaulting to '60;70;75;'") - message("See: https://developer.nvidia.com/cuda-gpus for more details on what architecture to supply.") - set(CMAKE_CUDA_ARCHITECTURES "60;70;75" CACHE STRING "" FORCE) - endif() - enable_language(CUDA) - find_package(CUDAToolkit REQUIRED) - add_subdirectory(src/cuda) - if (BUILD_TESTING AND FINUFFT_BUILD_TESTS) - add_subdirectory(perftest/cuda) - endif () - - list(APPEND INSTALL_TARGETS cufinufft cufinufft_static) -endif() +endif () + +if (FINUFFT_USE_CUDA) + if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + message("FINUFFT WARNING: No CUDA architecture supplied via '-DCMAKE_CUDA_ARCHITECTURES=...', defaulting to '60;70;75;'") + message("See: https://developer.nvidia.com/cuda-gpus for more details on what architecture to supply.") + set(CMAKE_CUDA_ARCHITECTURES "60;70;75" CACHE STRING "" FORCE) + endif () + enable_language(CUDA) + 
find_package(CUDAToolkit REQUIRED) + add_subdirectory(src/cuda) + if (BUILD_TESTING AND FINUFFT_BUILD_TESTS) + add_subdirectory(perftest/cuda) + endif () + + list(APPEND INSTALL_TARGETS cufinufft cufinufft_static) +endif () # Add tests defined in their own directory if (BUILD_TESTING AND FINUFFT_BUILD_TESTS AND FINUFFT_USE_CPU) @@ -258,27 +258,27 @@ endif () include(GNUInstallDirs) install(TARGETS ${INSTALL_TARGETS} PUBLIC_HEADER) install(FILES ${PROJECT_SOURCE_DIR}/LICENSE - DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/licenses/finufft) + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/licenses/finufft) if (FINUFFT_USE_CPU) - install(DIRECTORY ${PROJECT_SOURCE_DIR}/examples - DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/finufft - PATTERN "CMakeLists.txt" EXCLUDE - PATTERN "README" EXCLUDE - PATTERN "examples/cuda" EXCLUDE - ) - if (FINUFFT_BUILD_FORTRAN) - install(DIRECTORY ${PROJECT_SOURCE_DIR}/fortran/examples - DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/finufft/fortran - ) - install(FILES ${PROJECT_SOURCE_DIR}/include/finufft.fh - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} + install(DIRECTORY ${PROJECT_SOURCE_DIR}/examples + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/finufft + PATTERN "CMakeLists.txt" EXCLUDE + PATTERN "README" EXCLUDE + PATTERN "examples/cuda" EXCLUDE ) - endif() + if (FINUFFT_BUILD_FORTRAN) + install(DIRECTORY ${PROJECT_SOURCE_DIR}/fortran/examples + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/finufft/fortran + ) + install(FILES ${PROJECT_SOURCE_DIR}/include/finufft.fh + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} + ) + endif () endif () if (FINUFFT_USE_CUDA) - install(DIRECTORY ${PROJECT_SOURCE_DIR}/examples/cuda - DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/finufft/examples - PATTERN "README" EXCLUDE - PATTERN "CMakeLists.txt" EXCLUDE - ) -endif() + install(DIRECTORY ${PROJECT_SOURCE_DIR}/examples/cuda + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/finufft/examples + PATTERN "README" EXCLUDE + PATTERN "CMakeLists.txt" EXCLUDE + ) +endif () diff --git a/CMakePresets.json b/CMakePresets.json index 0dcb3a5eb..b04204500 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -1,163 +1,164 @@ { - "version": 2, - "cmakeMinimumRequired": { - "major": 3, - "minor": 19, - "patch": 0 + "version": 2, + "cmakeMinimumRequired": { + "major": 3, + "minor": 19, + "patch": 0 + }, + "configurePresets": [ + { + "name": "default", + "binaryDir": "build/default", + "displayName": "Default", + "description": "Default release configuration (ninja)", + "generator": "Ninja", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "RelWithDebInfo" + } }, - "configurePresets": [ - { - "name": "default", - "binaryDir": "build/default", - "displayName": "Default", - "description": "Default release configuration (ninja)", - "generator": "Ninja", - "cacheVariables": { - "CMAKE_BUILD_TYPE": "RelWithDebInfo" - } - }, - { - "name": "ninja-multi", - "binaryDir": "build/ninja", - "displayName": "Ninja Multi-config", - "description": "Multi-configuration build with ninja", - "generator": "Ninja Multi-Config" - }, - { - "name": "dev", - "binaryDir": "build/dev", - "displayName": "Development", - "description": "Development configuration (full tests and examples)", - "generator": "Ninja Multi-Config", - "cacheVariables": { - "FINUFFT_BUILD_TESTS": "ON", - "FINUFFT_BUILD_EXAMPLES": "ON", - "FINUFFT_BUILD_DEVEL": "ON" - } - }, - { - "name": "benchmark", - "binaryDir": "build/benchmark", - "displayName": "Benchmark", - "description": "Benchmark release configuration (ninja)", - "generator": "Ninja", - "cacheVariables": { - "CMAKE_BUILD_TYPE": 
"RelWithDebInfo", - "FINUFFT_BUILD_TESTS": "ON", - "FINUFFT_BUILD_EXAMPLES": "ON", - "FINUFFT_FFTW_SUFFIX": "", - "FINUFFT_USE_OPENMP": "OFF" - } - }, - { - "name": "manylinux", - "binaryDir": "build/manylinux", - "displayName": "manylinux", - "description": "Configuration for maximum binary compatibility", - "inherits": "default", - "cacheVariables": { - "FINUFFT_ARCH_FLAGS": "-march=x86-64 -mtune=generic -msse4" - } - }, - { - "name": "singlethreaded", - "binaryDir": "build/singlethreaded", - "displayName": "singlethreaded", - "description": "Configuration for single-threaded build. Disables OpenMP for finufft and FFTW", - "inherits": "default", - "cacheVariables": { - "FINUFFT_FFTW_SUFFIX": "", - "FINUFFT_USE_OPENMP": "OFF" - } - }, - { - "name": "icx", - "binaryDir": "build/icx", - "displayName": "Intel Compiler (llvm)", - "description": "Build with Intel Compiler", - "generator": "Ninja Multi-Config", - "cacheVariables": { - "CMAKE_C_COMPILER": "icx", - "CMAKE_CXX_COMPILER": "icpx", - "CMAKE_Fortran_COMPILER": "ifx", - "FINUFFT_ARCH_FLAGS": "-xHost", - "CMAKE_CXX_FLAGS": "-fp-model=strict" - } - }, - { - "name": "icc", - "binaryDir": "build/icc", - "displayName": "Intel Compiler", - "description": "Build with Intel Compiler", - "generator": "Ninja Multi-Config", - "cacheVariables": { - "CMAKE_C_COMPILER": "icc", - "CMAKE_CXX_COMPILER": "icpc", - "CMAKE_Fortran_COMPILER": "ifort", - "FINUFFT_ARCH_FLAGS": "-xHost", - "CMAKE_CXX_FLAGS": "-fp-model=strict" - } - }, - { - "name": "matlab", - "binaryDir": "build/matlab", - "displayName": "matlab", - "description": "Build with the matlab interface", - "generator": "Ninja Multi-Config", - "cacheVariables": { - "FINUFFT_FFTW_SUFFIX": "Threads", - "FINUFFT_BUILD_MATLAB": "ON", - "FINUFFT_ENABLE_SANITIZERS": "OFF" - } - } - ], - "buildPresets": [ - { - "name": "default", - "configurePreset": "default" - }, - { - "name": "dev", - "configurePreset": "dev", - "configuration": "RelWithDebInfo" - }, - { - "name": "ninja-multi", - "configurePreset": "ninja-multi", - "configuration": "RelWithDebInfo" - }, - { - "name": "manylinux", - "configurePreset": "manylinux" - }, - { - "name": "singlethreaded", - "configurePreset": "singlethreaded" - }, - { - "name": "icc", - "configurePreset": "icc", - "configuration": "RelWithDebInfo" - }, - { - "name": "icx", - "configurePreset": "icx", - "configuration": "RelWithDebInfo" - }, - { - "name": "matlab", - "configurePreset": "matlab", - "configuration": "Release" - } - ], - "testPresets": [ - { - "name": "dev", - "configurePreset": "dev", - "configuration": "Debug", - "environment": { - "OMP_NUM_THREADS": "1" - } - } - ] + { + "name": "ninja-multi", + "binaryDir": "build/ninja", + "displayName": "Ninja Multi-config", + "description": "Multi-configuration build with ninja", + "generator": "Ninja Multi-Config" + }, + { + "name": "dev", + "binaryDir": "build/dev", + "displayName": "Development", + "description": "Development configuration (full tests and examples)", + "generator": "Ninja Multi-Config", + "cacheVariables": { + "FINUFFT_BUILD_TESTS": "ON", + "FINUFFT_BUILD_EXAMPLES": "ON", + "FINUFFT_BUILD_DEVEL": "ON" + } + }, + { + "name": "benchmark", + "binaryDir": "build/benchmark", + "displayName": "Benchmark", + "description": "Benchmark release configuration (ninja)", + "generator": "Ninja", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "RelWithDebInfo", + "FINUFFT_BUILD_TESTS": "ON", + "FINUFFT_BUILD_EXAMPLES": "ON", + "FINUFFT_FFTW_SUFFIX": "", + "FINUFFT_USE_OPENMP": "OFF" + } + }, + { + "name": "manylinux", + 
"binaryDir": "build/manylinux", + "displayName": "manylinux", + "description": "Configuration for maximum binary compatibility", + "inherits": "default", + "cacheVariables": { + "FINUFFT_ARCH_FLAGS": "-march=x86-64 -mtune=generic -msse4" + } + }, + { + "name": "singlethreaded", + "binaryDir": "build/singlethreaded", + "displayName": "singlethreaded", + "description": + "Configuration for single-threaded build. Disables OpenMP for finufft and FFTW", + "inherits": "default", + "cacheVariables": { + "FINUFFT_FFTW_SUFFIX": "", + "FINUFFT_USE_OPENMP": "OFF" + } + }, + { + "name": "icx", + "binaryDir": "build/icx", + "displayName": "Intel Compiler (llvm)", + "description": "Build with Intel Compiler", + "generator": "Ninja Multi-Config", + "cacheVariables": { + "CMAKE_C_COMPILER": "icx", + "CMAKE_CXX_COMPILER": "icpx", + "CMAKE_Fortran_COMPILER": "ifx", + "FINUFFT_ARCH_FLAGS": "-xHost", + "CMAKE_CXX_FLAGS": "-fp-model=strict" + } + }, + { + "name": "icc", + "binaryDir": "build/icc", + "displayName": "Intel Compiler", + "description": "Build with Intel Compiler", + "generator": "Ninja Multi-Config", + "cacheVariables": { + "CMAKE_C_COMPILER": "icc", + "CMAKE_CXX_COMPILER": "icpc", + "CMAKE_Fortran_COMPILER": "ifort", + "FINUFFT_ARCH_FLAGS": "-xHost", + "CMAKE_CXX_FLAGS": "-fp-model=strict" + } + }, + { + "name": "matlab", + "binaryDir": "build/matlab", + "displayName": "matlab", + "description": "Build with the matlab interface", + "generator": "Ninja Multi-Config", + "cacheVariables": { + "FINUFFT_FFTW_SUFFIX": "Threads", + "FINUFFT_BUILD_MATLAB": "ON", + "FINUFFT_ENABLE_SANITIZERS": "OFF" + } + } + ], + "buildPresets": [ + { + "name": "default", + "configurePreset": "default" + }, + { + "name": "dev", + "configurePreset": "dev", + "configuration": "RelWithDebInfo" + }, + { + "name": "ninja-multi", + "configurePreset": "ninja-multi", + "configuration": "RelWithDebInfo" + }, + { + "name": "manylinux", + "configurePreset": "manylinux" + }, + { + "name": "singlethreaded", + "configurePreset": "singlethreaded" + }, + { + "name": "icc", + "configurePreset": "icc", + "configuration": "RelWithDebInfo" + }, + { + "name": "icx", + "configurePreset": "icx", + "configuration": "RelWithDebInfo" + }, + { + "name": "matlab", + "configurePreset": "matlab", + "configuration": "Release" + } + ], + "testPresets": [ + { + "name": "dev", + "configurePreset": "dev", + "configuration": "Debug", + "environment": { + "OMP_NUM_THREADS": "1" + } + } + ] } diff --git a/contrib/legendre_rule_fast.cpp b/contrib/legendre_rule_fast.cpp index 01b626cc3..a91119161 100644 --- a/contrib/legendre_rule_fast.cpp +++ b/contrib/legendre_rule_fast.cpp @@ -12,16 +12,16 @@ #include namespace finufft { - namespace quadrature { - -void legendre_compute_glr ( int n, double x[], double w[] ); -void legendre_compute_glr0 ( int n, double *p, double *pp ); -void legendre_compute_glr1 ( int n, double *roots, double *ders ); -void legendre_compute_glr2 ( double p, int n, double *roots, double *ders ); -double rk2_leg ( double t, double tn, double x, int n ); -double ts_mult ( double *u, double h, int n ); - -void legendre_compute_glr ( int n, double x[], double w[] ) +namespace quadrature { + +void legendre_compute_glr(int n, double x[], double w[]); +void legendre_compute_glr0(int n, double *p, double *pp); +void legendre_compute_glr1(int n, double *roots, double *ders); +void legendre_compute_glr2(double p, int n, double *roots, double *ders); +double rk2_leg(double t, double tn, double x, int n); +double ts_mult(double *u, double h, int n); + 
+void legendre_compute_glr(int n, double x[], double w[]) /******************************************************************************/ /* Purpose: @@ -30,7 +30,7 @@ void legendre_compute_glr ( int n, double x[], double w[] ) Licensing: - This code is distributed under the GNU LGPL license. + This code is distributed under the GNU LGPL license. Modified: @@ -43,8 +43,8 @@ void legendre_compute_glr ( int n, double x[], double w[] ) Reference: - Andreas Glaser, Xiangtao Liu, Vladimir Rokhlin, - A fast algorithm for the calculation of the roots of special functions, + Andreas Glaser, Xiangtao Liu, Vladimir Rokhlin, + A fast algorithm for the calculation of the roots of special functions, SIAM Journal on Scientific Computing, Volume 29, Number 4, pages 1420-1438, 2007. @@ -61,47 +61,41 @@ void legendre_compute_glr ( int n, double x[], double w[] ) double p; double pp; double w_sum; -/* - Get the value and derivative of the N-th Legendre polynomial at 0. -*/ - legendre_compute_glr0 ( n, &p, &pp ); -/* - Either zero is a root, or we have to call a function to find the first root. -*/ - if ( n % 2 == 1 ) - { - x[(n-1)/2] = p; - w[(n-1)/2] = pp; + /* + Get the value and derivative of the N-th Legendre polynomial at 0. + */ + legendre_compute_glr0(n, &p, &pp); + /* + Either zero is a root, or we have to call a function to find the first root. + */ + if (n % 2 == 1) { + x[(n - 1) / 2] = p; + w[(n - 1) / 2] = pp; + } else { + legendre_compute_glr2(p, n, &x[n / 2], &w[n / 2]); } - else - { - legendre_compute_glr2 ( p, n, &x[n/2], &w[n/2] ); - } -/* - Get the complete set of roots and derivatives. -*/ - legendre_compute_glr1 ( n, x, w ); -/* - Compute the weights. -*/ - for ( i = 0; i < n; i++ ) - { - w[i] = 2.0 / ( 1.0 - x[i] ) / ( 1.0 + x[i] ) / w[i] / w[i]; + /* + Get the complete set of roots and derivatives. + */ + legendre_compute_glr1(n, x, w); + /* + Compute the weights. + */ + for (i = 0; i < n; i++) { + w[i] = 2.0 / (1.0 - x[i]) / (1.0 + x[i]) / w[i] / w[i]; } w_sum = 0.0; - for ( i = 0; i < n; i++ ) - { + for (i = 0; i < n; i++) { w_sum = w_sum + w[i]; } - for ( i = 0; i < n; i++ ) - { + for (i = 0; i < n; i++) { w[i] = 2.0 * w[i] / w_sum; } return; } /******************************************************************************/ -void legendre_compute_glr0 ( int n, double *p, double *pp ) +void legendre_compute_glr0(int n, double *p, double *pp) /******************************************************************************/ /* @@ -111,7 +105,7 @@ void legendre_compute_glr0 ( int n, double *p, double *pp ) Licensing: - This code is distributed under the GNU LGPL license. + This code is distributed under the GNU LGPL license. Modified: @@ -124,8 +118,8 @@ void legendre_compute_glr0 ( int n, double *p, double *pp ) Reference: - Andreas Glaser, Xiangtao Liu, Vladimir Rokhlin, - A fast algorithm for the calculation of the roots of special functions, + Andreas Glaser, Xiangtao Liu, Vladimir Rokhlin, + A fast algorithm for the calculation of the roots of special functions, SIAM Journal on Scientific Computing, Volume 29, Number 4, pages 1420-1438, 2007. 
@@ -144,18 +138,17 @@ void legendre_compute_glr0 ( int n, double *p, double *pp ) double ppm1; double ppm2; - pm2 = 0.0; - pm1 = 1.0; + pm2 = 0.0; + pm1 = 1.0; ppm2 = 0.0; ppm1 = 0.0; - for ( k = 0; k < n; k++ ) - { - dk = ( double ) k; - *p = - dk * pm2 / ( dk + 1.0 ); - *pp = ( ( 2.0 * dk + 1.0 ) * pm1 - dk * ppm2 ) / ( dk + 1.0 ); - pm2 = pm1; - pm1 = *p; + for (k = 0; k < n; k++) { + dk = (double)k; + *p = -dk * pm2 / (dk + 1.0); + *pp = ((2.0 * dk + 1.0) * pm1 - dk * ppm2) / (dk + 1.0); + pm2 = pm1; + pm1 = *p; ppm2 = ppm1; ppm1 = *pp; } @@ -163,7 +156,7 @@ void legendre_compute_glr0 ( int n, double *p, double *pp ) } /******************************************************************************/ -void legendre_compute_glr1 ( int n, double *x, double *ders ) +void legendre_compute_glr1(int n, double *x, double *ders) /******************************************************************************/ /* @@ -179,7 +172,7 @@ void legendre_compute_glr1 ( int n, double *x, double *ders ) Licensing: - This code is distributed under the GNU LGPL license. + This code is distributed under the GNU LGPL license. Modified: @@ -192,8 +185,8 @@ void legendre_compute_glr1 ( int n, double *x, double *ders ) Reference: - Andreas Glaser, Xiangtao Liu, Vladimir Rokhlin, - A fast algorithm for the calculation of the roots of special functions, + Andreas Glaser, Xiangtao Liu, Vladimir Rokhlin, + A fast algorithm for the calculation of the roots of special functions, SIAM Journal on Scientific Computing, Volume 29, Number 4, pages 1420-1438, 2007. @@ -202,11 +195,11 @@ void legendre_compute_glr1 ( int n, double *x, double *ders ) Input, int N, the order of the Legendre polynomial. Input/output, double X[N]. On input, a starting value - has been set in one entry. On output, the roots of the Legendre + has been set in one entry. On output, the roots of the Legendre polynomial. Input/output, double DERS[N]. On input, a starting value - has been set in one entry. On output, the derivatives of the Legendre + has been set in one entry. On output, the derivatives of the Legendre polynomial at the zeros. 
Local Parameters: @@ -228,27 +221,23 @@ void legendre_compute_glr1 ( int n, double *x, double *ders ) double *up; double xp; - if ( n % 2 == 1 ) - { - n2 = ( n - 1 ) / 2; - s = 1; - } - else - { + if (n % 2 == 1) { + n2 = (n - 1) / 2; + s = 1; + } else { n2 = n / 2; - s = 0; + s = 0; } - u = ( double * ) malloc ( ( m + 2 ) * sizeof ( double ) ); - up = ( double * ) malloc ( ( m + 1 ) * sizeof ( double ) ); + u = (double *)malloc((m + 2) * sizeof(double)); + up = (double *)malloc((m + 1) * sizeof(double)); - dn = ( double ) n; + dn = (double)n; - for ( j = n2; j < n - 1; j++ ) - { + for (j = n2; j < n - 1; j++) { xp = x[j]; - h = rk2_leg ( pi/2.0, -pi/2.0, xp, n ) - xp; + h = rk2_leg(pi / 2.0, -pi / 2.0, xp, n) - xp; u[0] = 0.0; u[1] = 0.0; @@ -257,41 +246,36 @@ void legendre_compute_glr1 ( int n, double *x, double *ders ) up[0] = 0.0; up[1] = u[2]; - for ( k = 0; k <= m - 2; k++ ) - { - dk = ( double ) k; + for (k = 0; k <= m - 2; k++) { + dk = (double)k; - u[k+3] = - ( - 2.0 * xp * ( dk + 1.0 ) * u[k+2] - + ( dk * ( dk + 1.0 ) - dn * ( dn + 1.0 ) ) * u[k+1] / ( dk + 1.0 ) - ) / ( 1.0 - xp ) / ( 1.0 + xp ) / ( dk + 2.0 ); + u[k + 3] = (2.0 * xp * (dk + 1.0) * u[k + 2] + + (dk * (dk + 1.0) - dn * (dn + 1.0)) * u[k + 1] / (dk + 1.0)) / + (1.0 - xp) / (1.0 + xp) / (dk + 2.0); - up[k+2] = ( dk + 2.0 ) * u[k+3]; + up[k + 2] = (dk + 2.0) * u[k + 3]; } - for ( l = 0; l < 5; l++ ) - { - h = h - ts_mult ( u, h, m ) / ts_mult ( up, h, m-1 ); + for (l = 0; l < 5; l++) { + h = h - ts_mult(u, h, m) / ts_mult(up, h, m - 1); } - x[j+1] = xp + h; - ders[j+1] = ts_mult ( up, h, m-1 ); + x[j + 1] = xp + h; + ders[j + 1] = ts_mult(up, h, m - 1); } - free ( u ); - free ( up ); + free(u); + free(up); - for ( k = 0; k < n2 + s; k++ ) - { - x[k] = - x[n-k-1]; - ders[k] = ders[n-k-1]; + for (k = 0; k < n2 + s; k++) { + x[k] = -x[n - k - 1]; + ders[k] = ders[n - k - 1]; } return; } /******************************************************************************/ -void legendre_compute_glr2 ( double pn0, int n, double *x1, double *d1 ) +void legendre_compute_glr2(double pn0, int n, double *x1, double *d1) /******************************************************************************/ /* @@ -308,7 +292,7 @@ void legendre_compute_glr2 ( double pn0, int n, double *x1, double *d1 ) Licensing: - This code is distributed under the GNU LGPL license. + This code is distributed under the GNU LGPL license. Modified: @@ -321,8 +305,8 @@ void legendre_compute_glr2 ( double pn0, int n, double *x1, double *d1 ) Reference: - Andreas Glaser, Xiangtao Liu, Vladimir Rokhlin, - A fast algorithm for the calculation of the roots of special functions, + Andreas Glaser, Xiangtao Liu, Vladimir Rokhlin, + A fast algorithm for the calculation of the roots of special functions, SIAM Journal on Scientific Computing, Volume 29, Number 4, pages 1420-1438, 2007. @@ -345,55 +329,52 @@ void legendre_compute_glr2 ( double pn0, int n, double *x1, double *d1 ) double dn; int k; int l; - int m = 30; + int m = 30; const double pi = 3.141592653589793; double t; double *u; double *up; - t = 0.0; - *x1 = rk2_leg ( t, -pi/2.0, 0.0, n ); + t = 0.0; + *x1 = rk2_leg(t, -pi / 2.0, 0.0, n); - u = ( double * ) malloc ( ( m + 2 ) * sizeof ( double ) ); - up = ( double * ) malloc ( ( m + 1 ) * sizeof ( double ) ); + u = (double *)malloc((m + 2) * sizeof(double)); + up = (double *)malloc((m + 1) * sizeof(double)); - dn = ( double ) n; -/* - U[0] and UP[0] are never used. - U[M+1] is set, but not used, and UP[M] is set and not used. - What gives? 
-*/ + dn = (double)n; + /* + U[0] and UP[0] are never used. + U[M+1] is set, but not used, and UP[M] is set and not used. + What gives? + */ u[0] = 0.0; u[1] = pn0; up[0] = 0.0; - - for ( k = 0; k <= m - 2; k = k + 2 ) - { - dk = ( double ) k; - - u[k+2] = 0.0; - u[k+3] = ( dk * ( dk + 1.0 ) - dn * ( dn + 1.0 ) ) * u[k+1] - / ( dk + 1.0 ) / ( dk + 2.0 ); - - up[k+1] = 0.0; - up[k+2] = ( dk + 2.0 ) * u[k+3]; + + for (k = 0; k <= m - 2; k = k + 2) { + dk = (double)k; + + u[k + 2] = 0.0; + u[k + 3] = (dk * (dk + 1.0) - dn * (dn + 1.0)) * u[k + 1] / (dk + 1.0) / (dk + 2.0); + + up[k + 1] = 0.0; + up[k + 2] = (dk + 2.0) * u[k + 3]; } - - for ( l = 0; l < 5; l++ ) - { - *x1 = *x1 - ts_mult ( u, *x1, m ) / ts_mult ( up, *x1, m-1 ); + + for (l = 0; l < 5; l++) { + *x1 = *x1 - ts_mult(u, *x1, m) / ts_mult(up, *x1, m - 1); } - *d1 = ts_mult ( up, *x1, m-1 ); + *d1 = ts_mult(up, *x1, m - 1); - free ( u ); - free ( up) ; + free(u); + free(up); return; } /******************************************************************************/ -double rk2_leg ( double t1, double t2, double x, int n ) +double rk2_leg(double t1, double t2, double x, int n) /******************************************************************************/ /* @@ -403,7 +384,7 @@ double rk2_leg ( double t1, double t2, double x, int n ) Licensing: - This code is distributed under the GNU LGPL license. + This code is distributed under the GNU LGPL license. Modified: @@ -434,29 +415,27 @@ double rk2_leg ( double t1, double t2, double x, int n ) double snn1; double t; - h = ( t2 - t1 ) / ( double ) m; - snn1 = sqrt ( ( double ) ( n * ( n + 1 ) ) ); + h = (t2 - t1) / (double)m; + snn1 = sqrt((double)(n * (n + 1))); t = t1; - for ( j = 0; j < m; j++ ) - { - f = ( 1.0 - x ) * ( 1.0 + x ); - k1 = - h * f / ( snn1 * sqrt ( f ) - 0.5 * x * sin ( 2.0 * t ) ); - x = x + k1; + for (j = 0; j < m; j++) { + f = (1.0 - x) * (1.0 + x); + k1 = -h * f / (snn1 * sqrt(f) - 0.5 * x * sin(2.0 * t)); + x = x + k1; t = t + h; - f = ( 1.0 - x ) * ( 1.0 + x ); - k2 = - h * f / ( snn1 * sqrt ( f ) - 0.5 * x * sin ( 2.0 * t ) ); - x = x + 0.5 * ( k2 - k1 ); + f = (1.0 - x) * (1.0 + x); + k2 = -h * f / (snn1 * sqrt(f) - 0.5 * x * sin(2.0 * t)); + x = x + 0.5 * (k2 - k1); } return x; } /******************************************************************************/ - -double ts_mult ( double *u, double h, int n ) +double ts_mult(double *u, double h, int n) /******************************************************************************/ /* @@ -470,7 +449,7 @@ double ts_mult ( double *u, double h, int n ) Licensing: - This code is distributed under the GNU LGPL license. + This code is distributed under the GNU LGPL license. 
Modified: @@ -496,11 +475,10 @@ double ts_mult ( double *u, double h, int n ) double hk; int k; double ts; - + ts = 0.0; hk = 1.0; - for ( k = 1; k<= n; k++ ) - { + for (k = 1; k <= n; k++) { ts = ts + u[k] * hk; hk = hk * h; } @@ -508,5 +486,5 @@ double ts_mult ( double *u, double h, int n ) } /******************************************************************************/ - } // namespace -} // namespace +} // namespace quadrature +} // namespace finufft diff --git a/contrib/legendre_rule_fast.h b/contrib/legendre_rule_fast.h index 49c5bcf13..357909f9e 100644 --- a/contrib/legendre_rule_fast.h +++ b/contrib/legendre_rule_fast.h @@ -2,9 +2,9 @@ #define GAUSSQUAD_H namespace finufft { - namespace quadrature { - void legendre_compute_glr ( int n, double x[], double w[] ); - } // namespace -} // namespace +namespace quadrature { +void legendre_compute_glr(int n, double x[], double w[]); +} // namespace quadrature +} // namespace finufft #endif diff --git a/contributing.md b/contributing.md new file mode 100644 index 000000000..ad79e9abf --- /dev/null +++ b/contributing.md @@ -0,0 +1,15 @@ +This repository is formatted according to the .clang-format in the root directory. +Please enable the reformatting hook before committing your changes. +See [pre-commit](https://pre-commit.com/) for more information. +A quick summary: +``` +pip install pre-commit +pre-commit install +``` + +We also suggest to configure your IDE to use the same formatting settings. + +Another suggestion is to ignore the formatting commits in your git configuration: +``` +git config blame.ignoreRevsFile .git-blame-ignore-revs +``` diff --git a/devel/eval_ker_expts.cpp b/devel/eval_ker_expts.cpp index 015bb8a38..8da4a1699 100644 --- a/devel/eval_ker_expts.cpp +++ b/devel/eval_ker_expts.cpp @@ -3,22 +3,25 @@ compile with: -g++ eval_ker_expts.cpp -o eval_ker_expts -Ofast -funroll-loops -march=native; time ./eval_ker_expts +g++ eval_ker_expts.cpp -o eval_ker_expts -Ofast -funroll-loops -march=native; time +./eval_ker_expts Barnett 3/28/18 for JD Patel (Intel). Single-prec version also of interest, if faster. */ -#include -#include -#include #include +#include +#include +#include // Choose prec... typedef double FLT; -//typedef float FLT; +// typedef float FLT; -static inline void evaluate_kernel_vector(FLT * __restrict__ ker, const FLT * __restrict__ args, const FLT beta, const FLT c, const int N) +static inline void evaluate_kernel_vector(FLT *__restrict__ ker, + const FLT *__restrict__ args, const FLT beta, + const FLT c, const int N) /* Evaluate kernel for a vector of N arguments. Can comment out either or both loops. The #pragra's need to be removed for icpc if -fopenmp not used. 
@@ -26,33 +29,31 @@ static inline void evaluate_kernel_vector(FLT * __restrict__ ker, const FLT * __ { #pragma omp simd for (int i = 0; i < N; i++) // Loop 1: Compute exponential arguments - ker[i] = beta * sqrt(1.0 - c*args[i]*args[i]); - //ker[i] = beta * (1.0 - c*args[i]*args[i]); // no-sqrt version - + ker[i] = beta * sqrt(1.0 - c * args[i] * args[i]); + // ker[i] = beta * (1.0 - c*args[i]*args[i]); // no-sqrt version + #pragma omp simd for (int i = 0; i < N; i++) // Loop 2: Compute exponentials ker[i] = exp(ker[i]); } -int main(int argc, char* argv[]) -{ - int M = (int) 1e7; // # of reps - int w=10; // spread width (small), needn't be mult of 4 - FLT beta=2.3*w, c = 4.0/(w*w); // ker params - FLT iw = 1.0/(FLT)w; - FLT ans = 0.0; // dummy answer +int main(int argc, char *argv[]) { + int M = (int)1e7; // # of reps + int w = 10; // spread width (small), needn't be mult of 4 + FLT beta = 2.3 * w, c = 4.0 / (w * w); // ker params + FLT iw = 1.0 / (FLT)w; + FLT ans = 0.0; // dummy answer std::vector x(w); std::vector f(w); - for (int i=1;i -#include -#include #include +#include +#include +#include // Choose prec... typedef double FLT; -//typedef float FLT; +// typedef float FLT; -static inline void evaluate_kernel_vector(FLT* ker, const FLT* args, const FLT beta, const FLT c, const int N) +static inline void evaluate_kernel_vector(FLT *ker, const FLT *args, const FLT beta, + const FLT c, const int N) /* Evaluate kernel for a vector of N arguments. The #pragmas need to be removed for icpc if -fopenmp not used. For g++-7, this pragma (with -fopenmp) slows it down from 0.2s to 0.4s! THe __restrict__ on the I/O args don't matter. */ { - //#pragma omp simd - for (int i = 0; i < N; i++) - ker[i] = exp(beta * sqrt(FLT(1.0) - c*args[i]*args[i])); + // #pragma omp simd + for (int i = 0; i < N; i++) ker[i] = exp(beta * sqrt(FLT(1.0) - c * args[i] * args[i])); // FLT(1.0) suggested by mreineck - // slows down from 0.2s to 2.0s for w=12, unless it's at 0.4s when no effect... - // for (int i = 0; i < N; i++) - // if (fabs(args[i]) >= (FLT)N/2) // note fabs not abs! - // ker[i] = 0.0; + // slows down from 0.2s to 2.0s for w=12, unless it's at 0.4s when no effect... + // for (int i = 0; i < N; i++) + // if (fabs(args[i]) >= (FLT)N/2) // note fabs not abs! + // ker[i] = 0.0; } -int main(int argc, char* argv[]) -{ - int M = (int) 1e7; // # of reps - if (argc>1) - sscanf(argv[1],"%d",&M); // find not needed to get the 0.2 s time. - int w=11; // spread width: 10 0.17s, 11 1.8s, 12 0.2s, 13 2.0s, 15 2.5s - //if (argc>2) // even including this code slows to 0.4s !! - //sscanf(argv[2],"%d",&w); // .. but speeds up w=13 from 2s to 0.4s ! - FLT beta=2.3*w, c = 4.0/(w*w); // typ ker params - FLT iw = 1.0/(FLT)w; - FLT ans = 0.0; // dummy answer +int main(int argc, char *argv[]) { + int M = (int)1e7; // # of reps + if (argc > 1) sscanf(argv[1], "%d", &M); // find not needed to get the 0.2 s time. + int w = 11; // spread width: 10 0.17s, 11 1.8s, 12 0.2s, 13 2.0s, 15 2.5s + // if (argc>2) // even including this code slows to 0.4s !! + // sscanf(argv[2],"%d",&w); // .. but speeds up w=13 from 2s to 0.4s ! + FLT beta = 2.3 * w, c = 4.0 / (w * w); // typ ker params + FLT iw = 1.0 / (FLT)w; + FLT ans = 0.0; // dummy answer std::vector x(w); std::vector f(w); - for (int i=1;i<=M;++i) { // i=0 to M-1 : 2.1s; i=1 to M : 0.2s !!!!! 
- FLT xi = -w/(FLT)2.0 + i/(FLT)M; // dummy offset to make each rep different - for (int j=0;j -#include -#include #include - +#include +#include +#include #ifdef VCL // Use Agner Fog's vector class library @@ -33,65 +33,66 @@ that correlates w/ 0.2s magic. // Choose prec... typedef double FLT; -//typedef float FLT; +// typedef float FLT; -static inline void evaluate_kernel_vector(FLT * __restrict__ ker, const FLT * __restrict__ args, const FLT beta, const FLT c, const int N) +static inline void evaluate_kernel_vector(FLT *__restrict__ ker, + const FLT *__restrict__ args, const FLT beta, + const FLT c, const int N) /* Evaluate kernel for a vector of N arguments. -*/ + */ { -#ifdef VCL - for (int i = 0; i < N; i+=4) // Assume w divisible by 4 +#ifdef VCL + for (int i = 0; i < N; i += 4) // Assume w divisible by 4 { Vec4d vec; vec.load(args + i); - vec = exp(beta*sqrt(1.0 - c*vec*vec)); + vec = exp(beta * sqrt(1.0 - c * vec * vec)); vec.store(ker + i); - } + } #else for (int i = 0; i < N; i++) // Straight computation, note no pragma omp simd - ker[i] = exp(beta * sqrt(1.0 - c*args[i]*args[i])); + ker[i] = exp(beta * sqrt(1.0 - c * args[i] * args[i])); #endif - } -int main(int argc, char* argv[]) -{ - int M = (int) 1e7; // # of reps - int w=12; // 12, spread width (small), needn't be mult of 4, 15 takes 3.2s but 12 only 0.2s, in g++-7. But not in gcc 5.4.0 +int main(int argc, char *argv[]) { + int M = (int)1e7; // # of reps + int w = 12; // 12, spread width (small), needn't be mult of 4, 15 takes 3.2s but 12 + // only 0.2s, in g++-7. But not in gcc 5.4.0 - if (1) { // 0 makes 10x slower (2s) than 1, which is 0.2 s, for g++-7 - ahb - if (argc == 3) - { - sscanf(argv[1],"%d",&M); - //sscanf(argv[2],"%d",&w); // slows down from 0.2s to 0.44s if use - why?? - } + if (1) { // 0 makes 10x slower (2s) than 1, which is 0.2 s, for g++-7 - ahb + if (argc == 3) { + sscanf(argv[1], "%d", &M); + // sscanf(argv[2],"%d",&w); // slows down from 0.2s to 0.44s if use - why?? + } } - - - FLT beta=2.3*w, c = 4.0/(w*w); // ker params - FLT iw = 1.0/(FLT)w; - FLT ans = 0.0; // dummy answer + + FLT beta = 2.3 * w, c = 4.0 / (w * w); // ker params + FLT iw = 1.0 / (FLT)w; + FLT ans = 0.0; // dummy answer std::vector x(w); std::vector f(w); FLT xi; FLT tmp_val; - + #pragma omp simd simdlen(64) // this pragma makes no difference on modern gcc. - for (int i=1;i<=M;++i) { // changing from i=1 to i=0 slows from 0.2s to 2.4s!!!! I don't understand - has to be a better way to control (assembly code?) - xi = i/(FLT)M; // dummy offset to make each rep different + for (int i = 1; i <= M; ++i) { // changing from i=1 to i=0 slows from 0.2s to 2.4s!!!! + // I don't understand - has to be a better way to + // control (assembly code?) + xi = i / (FLT)M; // dummy offset to make each rep different /* for (int j=0;j -#include -#include #include - +#include +#include +#include #ifdef VCL // Use Agner Fog's vector class library @@ -33,56 +33,56 @@ that correlates w/ 0.2s magic. // Choose prec... typedef double FLT; -//typedef float FLT; +// typedef float FLT; -static inline void evaluate_kernel_vector(FLT * __restrict__ ker, const FLT * __restrict__ args, const FLT beta, const FLT c, const int N) +static inline void evaluate_kernel_vector(FLT *__restrict__ ker, + const FLT *__restrict__ args, const FLT beta, + const FLT c, const int N) /* Evaluate kernel for a vector of N arguments. 
-*/ + */ { -#ifdef VCL - for (int i = 0; i < N; i+=4) // Assume w divisible by 4 +#ifdef VCL + for (int i = 0; i < N; i += 4) // Assume w divisible by 4 { Vec4d vec; vec.load(args + i); - vec = exp(beta*sqrt(1.0 - c*vec*vec)); + vec = exp(beta * sqrt(1.0 - c * vec * vec)); vec.store(ker + i); - } + } #else for (int i = 0; i < N; i++) // Straight computation, note no pragma omp simd - ker[i] = exp(beta * sqrt(1.0 - c*args[i]*args[i])); + ker[i] = exp(beta * sqrt(1.0 - c * args[i] * args[i])); #endif - } -int main(int argc, char* argv[]) -{ - int M = (int) 1e7; // # of reps - int w=12; // 12, spread width (small), needn't be mult of 4, 15 takes 3.2s but 12 only 0.2s, in g++-7. But not in gcc 5.4.0 +int main(int argc, char *argv[]) { + int M = (int)1e7; // # of reps + int w = 12; // 12, spread width (small), needn't be mult of 4, 15 takes 3.2s but 12 + // only 0.2s, in g++-7. But not in gcc 5.4.0 - if (1) { // 0 makes 10x slower (2s) than 1, which is 0.2 s, for g++-7 - ahb - if (argc == 3) - { - sscanf(argv[1],"%d",&M); - //sscanf(argv[2],"%d",&w); // slows down from 0.2s to 0.44s if use - why?? + if (1) { // 0 makes 10x slower (2s) than 1, which is 0.2 s, for g++-7 - ahb + if (argc == 3) { + sscanf(argv[1], "%d", &M); + // sscanf(argv[2],"%d",&w); // slows down from 0.2s to 0.44s if use - why?? + } } - } - - - FLT beta=2.3*w, c = 4.0/(w*w); // ker params - FLT iw = 1.0/(FLT)w; - FLT ans = 0.0; // dummy answer + + FLT beta = 2.3 * w, c = 4.0 / (w * w); // ker params + FLT iw = 1.0 / (FLT)w; + FLT ans = 0.0; // dummy answer std::vector x(w); std::vector f(w); - - for (int i=1;i<=M;++i) { // changing from i=1 to i=0 slows from 0.2s to 2.4s!!!! I don't understand - has to be a better way to control (assembly code?) - FLT xi = i/(FLT)M; // dummy offset to make each rep different - for (int j=0;j -#include -#include #include #include +#include +#include // no vectorize -//#pragma GCC optimize("no-tree-vectorize") +// #pragma GCC optimize("no-tree-vectorize") /* local NU coord fold+rescale macro: does the following affine transform to x: when p=true: map [-3pi,-pi) and [-pi,pi) and [pi,3pi) each to [0,N) otherwise, map [-N,0) and [0,N) and [N,2N) each to [0,N) @@ -16,63 +16,58 @@ The macro wins hands-down on i7, even for modern GCC9. This should be done in C++ not as a macro, someday. */ -#define FOLDRESCALE(x, N, p) (p ? \ - (x + (x>=-PI ? (x=0.0 ? (x<(FLT)N ? x : x-(FLT)N) : x+(FLT)N)) +#define FOLDRESCALE(x, N, p) \ + (p ? (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT)M_1_2PI * N) \ + : (x >= 0.0 ? (x < (FLT)N ? x : x - (FLT)N) : x + (FLT)N)) +#define FOLDRESCALE04(x, N, p) \ + (p ? ((x * FLT(M_1_2PI) + FLT(0.5)) - floor(x * FLT(M_1_2PI) + FLT(0.5))) * FLT(N) \ + : ((x / FLT(N)) - floor(x / FLT(N))) * FLT(N)) -#define FOLDRESCALE04(x, N, p) (p ? \ - ((x * FLT(M_1_2PI) + FLT(0.5)) - floor(x * FLT(M_1_2PI) + FLT(0.5))) * FLT(N) : \ - ((x/FLT(N))-floor(x/FLT(N)))*FLT(N)) +#define FOLDRESCALE05(x, N, p) \ + FLT(N) * (p ? ((x * FLT(M_1_2PI) + FLT(0.5)) - floor(x * FLT(M_1_2PI) + FLT(0.5))) \ + : ((x / FLT(N)) - floor(x / FLT(N)))) -#define FOLDRESCALE05(x, N, p) FLT(N) * (p ? 
\ - ((x * FLT(M_1_2PI) + FLT(0.5)) - floor(x * FLT(M_1_2PI) + FLT(0.5))) : \ - ((x/FLT(N))-floor(x/FLT(N)))) - -inline __attribute__((always_inline)) -FLT foldRescale00(FLT x, BIGINT N, bool p) { +inline __attribute__((always_inline)) FLT foldRescale00(FLT x, BIGINT N, bool p) { FLT result; FLT fN = FLT(N); if (p) { static constexpr FLT x2pi = FLT(M_1_2PI); - result = x * x2pi + FLT(0.5); + result = x * x2pi + FLT(0.5); result -= floor(result); } else { const FLT invN = FLT(1.0) / fN; - result = x * invN; + result = x * invN; result -= floor(result); } return result * fN; } -inline __attribute__((always_inline)) -FLT foldRescale01(FLT x, BIGINT N, bool p) { - return p ? (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT) M_1_2PI * N) : - (x >= 0.0 ? (x < (FLT) N ? x : x - (FLT) N) : x + (FLT) N); +inline __attribute__((always_inline)) FLT foldRescale01(FLT x, BIGINT N, bool p) { + return p ? (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT)M_1_2PI * N) + : (x >= 0.0 ? (x < (FLT)N ? x : x - (FLT)N) : x + (FLT)N); } template -inline __attribute__((always_inline)) -FLT foldRescale02(FLT x, BIGINT N) { +inline __attribute__((always_inline)) FLT foldRescale02(FLT x, BIGINT N) { if constexpr (p) { - return (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT) M_1_2PI * N); + return (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT)M_1_2PI * N); } else { - return (x >= 0.0 ? (x < (FLT) N ? x : x - (FLT) N) : x + (FLT) N); + return (x >= 0.0 ? (x < (FLT)N ? x : x - (FLT)N) : x + (FLT)N); } } template -inline __attribute__((always_inline)) -FLT foldRescale03(FLT x, BIGINT N) { +inline __attribute__((always_inline)) FLT foldRescale03(FLT x, BIGINT N) { FLT result; FLT fN = FLT(N); if constexpr (p) { static constexpr FLT x2pi = FLT(M_1_2PI); - result = std::fma(x, x2pi, FLT(0.5)); + result = std::fma(x, x2pi, FLT(0.5)); result -= floor(result); } else { const FLT invN = FLT(1.0) / fN; - result = x * invN; + result = x * invN; result -= floor(result); } return result * fN; @@ -80,14 +75,13 @@ FLT foldRescale03(FLT x, BIGINT N) { #ifdef __AVX2__ -inline __attribute__((always_inline)) -__m256d foldRescaleVec(__m256d x, BIGINT N) { +inline __attribute__((always_inline)) __m256d foldRescaleVec(__m256d x, BIGINT N) { __m256d result; - __m256d fN = _mm256_set1_pd(FLT(N)); + __m256d fN = _mm256_set1_pd(FLT(N)); static const __m256d x2pi = _mm256_set1_pd(FLT(M_1_2PI)); static const __m256d half = _mm256_set1_pd(FLT(0.5)); - result = _mm256_fmadd_pd(x, x2pi, half); - result = _mm256_sub_pd(result, _mm256_floor_pd(result)); + result = _mm256_fmadd_pd(x, x2pi, half); + result = _mm256_sub_pd(result, _mm256_floor_pd(result)); return _mm256_mul_pd(result, fN); } #endif @@ -95,129 +89,123 @@ __m256d foldRescaleVec(__m256d x, BIGINT N) { static std::mt19937_64 gen; static std::uniform_real_distribution<> dis(-10, 10); static const auto N = std::uniform_int_distribution<>{0, 1000}(gen); -static std::uniform_real_distribution<> disN(-N, 2*N); -static volatile auto pirange = true; +static std::uniform_real_distribution<> disN(-N, 2 * N); +static volatile auto pirange = true; static volatile auto notPirange = !pirange; static void BM_BASELINE(benchmark::State &state) { - for (auto _: state) { + for (auto _ : state) { benchmark::DoNotOptimize(dis(gen)); } } static void BM_FoldRescaleMacro(benchmark::State &state) { - for (auto _: state) { + for (auto _ : state) { FLT x = dis(gen); benchmark::DoNotOptimize(FOLDRESCALE(x, N, pirange)); } } static void BM_FoldRescaleMacroN(benchmark::State &state) { - 
for (auto _: state) { + for (auto _ : state) { FLT x = disN(gen); benchmark::DoNotOptimize(FOLDRESCALE(x, N, notPirange)); } } static void BM_FoldRescale00(benchmark::State &state) { - for (auto _: state) { + for (auto _ : state) { FLT x = dis(gen); benchmark::DoNotOptimize(foldRescale00(x, N, pirange)); } } - static void BM_FoldRescale00N(benchmark::State &state) { - for (auto _: state) { + for (auto _ : state) { FLT x = disN(gen); benchmark::DoNotOptimize(foldRescale00(x, N, notPirange)); } } - static void BM_FoldRescale01(benchmark::State &state) { - for (auto _: state) { + for (auto _ : state) { FLT x = dis(gen); benchmark::DoNotOptimize(foldRescale01(x, N, pirange)); } } - static void BM_FoldRescale01N(benchmark::State &state) { - for (auto _: state) { + for (auto _ : state) { FLT x = disN(gen); benchmark::DoNotOptimize(foldRescale01(x, N, notPirange)); } } static void BM_FoldRescale02(benchmark::State &state) { - for (auto _: state) { + for (auto _ : state) { FLT x = dis(gen); benchmark::DoNotOptimize(foldRescale02(x, N)); } } - static void BM_FoldRescale02N(benchmark::State &state) { - for (auto _: state) { + for (auto _ : state) { FLT x = disN(gen); benchmark::DoNotOptimize(foldRescale02(x, N)); } } - static void BM_FoldRescale03(benchmark::State &state) { - for (auto _: state) { + for (auto _ : state) { FLT x = dis(gen); benchmark::DoNotOptimize(foldRescale03(x, N)); } } static void BM_FoldRescale03N(benchmark::State &state) { - for (auto _: state) { + for (auto _ : state) { FLT x = disN(gen); benchmark::DoNotOptimize(foldRescale03(x, N)); } } static void BM_FoldRescale04(benchmark::State &state) { - for (auto _: state) { + for (auto _ : state) { FLT x = dis(gen); benchmark::DoNotOptimize(FOLDRESCALE04(x, N, pirange)); } } static void BM_FoldRescale04N(benchmark::State &state) { - for (auto _: state) { + for (auto _ : state) { FLT x = disN(gen); benchmark::DoNotOptimize(FOLDRESCALE04(x, N, notPirange)); } } static void BM_FoldRescale05(benchmark::State &state) { - for (auto _: state) { + for (auto _ : state) { FLT x = dis(gen); benchmark::DoNotOptimize(FOLDRESCALE05(x, N, pirange)); } } static void BM_FoldRescale05N(benchmark::State &state) { - for (auto _: state) { + for (auto _ : state) { FLT x = disN(gen); benchmark::DoNotOptimize(FOLDRESCALE05(x, N, notPirange)); } } - #ifdef __AVX2__ static void BM_FoldRescaleVec(benchmark::State &state) { - for (auto _: state) { + for (auto _ : state) { // Generate 4 floating point numbers double x1 = dis(gen); double x2 = dis(gen); double x3 = dis(gen); double x4 = dis(gen); - // Pack them into an AVX vector + // Pack them into an AVX vector __m256d x = _mm256_set_pd(x1, x2, x3, x4); // Call the foldRescaleVec function benchmark::DoNotOptimize(foldRescaleVec(x, N)); @@ -225,7 +213,6 @@ static void BM_FoldRescaleVec(benchmark::State &state) { } #endif - BENCHMARK(BM_BASELINE)->Iterations(10000000); BENCHMARK(BM_FoldRescaleMacro)->Iterations(1000000); BENCHMARK(BM_FoldRescale00)->Iterations(1000000); @@ -235,7 +222,7 @@ BENCHMARK(BM_FoldRescale03)->Iterations(10000000); BENCHMARK(BM_FoldRescale04)->Iterations(1000000); BENCHMARK(BM_FoldRescale05)->Iterations(1000000); #ifdef __AVX2__ -BENCHMARK(BM_FoldRescaleVec)->Iterations(1000000/4); +BENCHMARK(BM_FoldRescaleVec)->Iterations(1000000 / 4); #endif BENCHMARK(BM_FoldRescaleMacroN)->Iterations(1000000); BENCHMARK(BM_FoldRescale00N)->Iterations(1000000); @@ -245,7 +232,6 @@ BENCHMARK(BM_FoldRescale03N)->Iterations(1000000); BENCHMARK(BM_FoldRescale04N)->Iterations(1000000); 
BENCHMARK(BM_FoldRescale05N)->Iterations(1000000); - #ifdef __AVX2__ void testFoldRescaleVec_avx256_vs_foldRescale00() { // Generate 4 floating point numbers @@ -265,7 +251,8 @@ void testFoldRescaleVec_avx256_vs_foldRescale00() { for (int i = 0; i < 4; ++i) { double result00 = foldRescale03(xVec[i], N); if (std::abs(1 - result00 / resultVec[i]) > 1e-14) { - std::cout << "input: " << xVec[i] << " result00: " << result00 << " result256: " << resultVec[i] << std::endl; + std::cout << "input: " << xVec[i] << " result00: " << result00 + << " result256: " << resultVec[i] << std::endl; throw std::runtime_error("foldRescaleVec is not equivalent to foldRescale00"); } } @@ -273,44 +260,51 @@ void testFoldRescaleVec_avx256_vs_foldRescale00() { #endif void testFoldRescaleFunctions() { - for (bool p: {true}) { - for (int i = 0; i < 1024; ++i) { // Run the test 1000 times - FLT x = dis(gen); + for (bool p : {true}) { + for (int i = 0; i < 1024; ++i) { // Run the test 1000 times + FLT x = dis(gen); FLT resultMacro = FOLDRESCALE(x, N, p); - FLT result00 = foldRescale00(x, N, p); - FLT result01 = foldRescale01(x, N, p); - FLT result02 = p ? foldRescale02(x, N) : foldRescale02(x, N); - FLT result03 = p ? foldRescale03(x, N) : foldRescale03(x, N); - FLT result04 = FOLDRESCALE04(x, N, p); - FLT result05 = FOLDRESCALE05(x, N, p); - - // function that compares two floating point number with a tolerance, using relative error + FLT result00 = foldRescale00(x, N, p); + FLT result01 = foldRescale01(x, N, p); + FLT result02 = p ? foldRescale02(x, N) : foldRescale02(x, N); + FLT result03 = p ? foldRescale03(x, N) : foldRescale03(x, N); + FLT result04 = FOLDRESCALE04(x, N, p); + FLT result05 = FOLDRESCALE05(x, N, p); + + // function that compares two floating point number with a tolerance, using + // relative error auto compare = [](FLT a, FLT b) { return std::abs(a - b) > std::max(std::abs(a), std::abs(b)) * 10e-13; }; if (compare(resultMacro, result00)) { - std::cout << "resultMacro: " << resultMacro << " result00: " << result00 << std::endl; + std::cout << "resultMacro: " << resultMacro << " result00: " << result00 + << std::endl; throw std::runtime_error("function00 is wrong"); } if (compare(resultMacro, result01)) { - std::cout << "resultMacro: " << resultMacro << " result01: " << result01 << std::endl; + std::cout << "resultMacro: " << resultMacro << " result01: " << result01 + << std::endl; throw std::runtime_error("function01 is wrong"); } if (compare(resultMacro, result02)) { - std::cout << "resultMacro: " << resultMacro << " result02: " << result02 << std::endl; + std::cout << "resultMacro: " << resultMacro << " result02: " << result02 + << std::endl; throw std::runtime_error("function02 is wrong"); } if (compare(resultMacro, result03)) { - std::cout << "resultMacro: " << resultMacro << " result03: " << result03 << std::endl; + std::cout << "resultMacro: " << resultMacro << " result03: " << result03 + << std::endl; throw std::runtime_error("function03 is wrong"); } if (compare(resultMacro, result04)) { - std::cout << "resultMacro: " << resultMacro << " result04: " << result04 << std::endl; + std::cout << "resultMacro: " << resultMacro << " result04: " << result04 + << std::endl; throw std::runtime_error("function04 is wrong"); } if (compare(resultMacro, result05)) { - std::cout << "resultMacro: " << resultMacro << " result05: " << result05 << std::endl; + std::cout << "resultMacro: " << resultMacro << " result05: " << result05 + << std::endl; throw std::runtime_error("function05 is wrong"); } } @@ -324,7 
+318,7 @@ class BaselineSubtractingReporter : public benchmark::ConsoleReporter { } void ReportRuns(const std::vector &reports) override { - for (const auto &run: reports) { + for (const auto &run : reports) { if (run.benchmark_name() == "BM_BASELINE") { baseline_time = run.cpu_accumulated_time; } else { @@ -340,7 +334,7 @@ class BaselineSubtractingReporter : public benchmark::ConsoleReporter { }; int main(int argc, char **argv) { - pirange = argc & 1; + pirange = argc & 1; notPirange = !pirange; static std::random_device rd; const auto seed = rd(); diff --git a/devel/foldrescale_perf.cpp b/devel/foldrescale_perf.cpp index 3d423cdba..a4ac38c99 100644 --- a/devel/foldrescale_perf.cpp +++ b/devel/foldrescale_perf.cpp @@ -4,7 +4,8 @@ Compile with, eg on linux, double-prec: - g++ -O3 -funroll-loops -march=native -I../include foldrescale_perf.cpp -o foldrescale_perf + g++ -O3 -funroll-loops -march=native -I../include foldrescale_perf.cpp -o + foldrescale_perf Use -DSINGLE for single-prec @@ -35,9 +36,13 @@ using namespace std::chrono; #endif // old coord-handling macro ------------------------------------------------ -#define RESCALE(x,N,p) (p ? \ - (x*(FLT)M_1_2PI*N + (x*(FLT)M_1_2PI*N<-N/(FLT)2.0 ? (FLT)1.5 : (x*(FLT)M_1_2PI*N>N/(FLT)2.0 ? (FLT)-0.5 : (FLT)0.5))*N) : \ - (x<0 ? x+N : (x>N ? x-N : x))) +#define RESCALE(x, N, p) \ + (p ? (x * (FLT)M_1_2PI * N + \ + (x * (FLT)M_1_2PI * N < -N / (FLT)2.0 \ + ? (FLT)1.5 \ + : (x * (FLT)M_1_2PI * N > N / (FLT)2.0 ? (FLT) - 0.5 : (FLT)0.5)) * \ + N) \ + : (x < 0 ? x + N : (x > N ? x - N : x))) // function equivalent ----------------------------------------------------- FLT foldrescale(FLT x, BIGINT N, int pirange) @@ -48,58 +53,68 @@ FLT foldrescale(FLT x, BIGINT N, int pirange) // affine rescale... FLT z = x; if (pirange) - z = (N/(2*PI)) * (x+PI); // PI is (FLT)M_PI in defs.h + z = (N / (2 * PI)) * (x + PI); // PI is (FLT)M_PI in defs.h else z = x; // fold... 
- if (z<(FLT)0.0) + if (z < (FLT)0.0) z += (FLT)N; - else if (z>=(FLT)N) + else if (z >= (FLT)N) z -= (FLT)N; return z; -} +} // ========================================================================== -int main(int argc, char* argv[]) -{ - int M=100000000; // default: # pts to test - long int N = 1000000; // default: grid size, doesn't matter - - if (argc>1) { double w; sscanf(argv[1],"%lf",&w); M = (int)w; } - if (argc>2) { double w; sscanf(argv[2],"%lf",&w); N = (long int)w; } +int main(int argc, char *argv[]) { + int M = 100000000; // default: # pts to test + long int N = 1000000; // default: grid size, doesn't matter - FLT sum=0.0; + if (argc > 1) { + double w; + sscanf(argv[1], "%lf", &w); + M = (int)w; + } + if (argc > 2) { + double w; + sscanf(argv[2], "%lf", &w); + N = (long int)w; + } + + FLT sum = 0.0; auto tbegin = system_clock::now(); - for (int i=0;i dur = system_clock::now() - tbegin; // dur.count() is sec - printf("backgnd ops: \t%.3g s/call\t\t(sum:%.12g)\n",dur.count()/M,sum); + duration dur = system_clock::now() - tbegin; // dur.count() is sec + printf("backgnd ops: \t%.3g s/call\t\t(sum:%.12g)\n", dur.count() / M, + sum); sum = 0.0; - for (int pirange=0;pirange<2;++pirange) { + for (int pirange = 0; pirange < 2; ++pirange) { tbegin = system_clock::now(); - for (int i=0;i g++-9 -O3 -funroll-loops -march=native -I../include -fopenmp foldrescale_perf2.cpp -o foldrescale_perf2 -lgomp -alex@fiona /home/alex/numerics/finufft/devel> ./foldrescale_perf2 -simple array sum: 1.9 ns/call (sum:540.8833119415621) -simple bin over [-3pi,3pi): 1.1 ns/call (ans:100667) -w/ RESCALE1 macro: 4.3 ns/call (sum:499894508.4253364) -w/ RESCALE macro (pir=0): 6.7 ns/call (sum:499894508.4253364) -w/ RESCALE macro (pir=1): 4.5 ns/call (sum:499894508.4253364) -w/ foldrescale1: 8.3 ns/call (sum:499894508.4253364) -w/ foldrescale2: 7.0 ns/call (sum:499894508.4253364) -w/ foldrescale3: 7.0 ns/call (sum:499894508.4253364) -w/ foldrescale (pir=0): 6.7 ns/call (sum:499894508.4253364) -w/ foldrescale (pir=1): 8.2 ns/call (sum:499894508.4253364) - (ans:905754) - -alex@fiona /home/alex/numerics/finufft/devel> g++-9 -O3 -funroll-loops -march=native -I../include -fopenmp foldrescale_perf2.cpp -o foldrescale_perf2 -lgomp -Ofast -fno-finite-math-only -alex@fiona /home/alex/numerics/finufft/devel> ./foldrescale_perf2 +alex@fiona /home/alex/numerics/finufft/devel> g++-9 -O3 -funroll-loops -march=native +-I../include -fopenmp foldrescale_perf2.cpp -o foldrescale_perf2 -lgomp alex@fiona +/home/alex/numerics/finufft/devel> ./foldrescale_perf2 simple array sum: 1.9 +ns/call (sum:540.8833119415621) simple bin over [-3pi,3pi): 1.1 ns/call (ans:100667) w/ +RESCALE1 macro: 4.3 ns/call (sum:499894508.4253364) w/ RESCALE macro (pir=0): 6.7 +ns/call (sum:499894508.4253364) w/ RESCALE macro (pir=1): 4.5 ns/call +(sum:499894508.4253364) w/ foldrescale1: 8.3 ns/call (sum:499894508.4253364) w/ +foldrescale2: 7.0 ns/call (sum:499894508.4253364) w/ +foldrescale3: 7.0 ns/call (sum:499894508.4253364) w/ foldrescale +(pir=0): 6.7 ns/call (sum:499894508.4253364) w/ foldrescale (pir=1): 8.2 ns/call +(sum:499894508.4253364) (ans:905754) + +alex@fiona /home/alex/numerics/finufft/devel> g++-9 -O3 -funroll-loops -march=native +-I../include -fopenmp foldrescale_perf2.cpp -o foldrescale_perf2 -lgomp -Ofast +-fno-finite-math-only alex@fiona /home/alex/numerics/finufft/devel> ./foldrescale_perf2 simple array sum: 0.4 ns/call (sum:-9554.451222028649) simple bin over [-3pi,3pi): 1.5 ns/call (ans:100815) w/ RESCALE1 macro: 2.0 ns/call 
(sum:499919136.1859143) @@ -50,35 +50,31 @@ w/ foldrescale2: 6.7 ns/call (sum:499919136.1859144) w/ foldrescale3: 7.0 ns/call (sum:499919136.1859144) w/ foldrescale (pir=0): 6.4 ns/call (sum:499919136.1859144) w/ foldrescale (pir=1): 8.1 ns/call (sum:499919136.1859143) - (ans:904913) + (ans:904913) NOBIN: -alex@fiona /home/alex/numerics/finufft/devel> g++-9 -O3 -funroll-loops -march=native -I../include -fopenmp foldrescale_perf2.cpp -o foldrescale_perf2 -lgomp -DNOBIN -alex@fiona /home/alex/numerics/finufft/devel> ./foldrescale_perf2 -simple array sum: 1.3 ns/call (sum:-5028.023988434961) -w/ RESCALE1 macro: 1.3 ns/call (sum:499984776.5128576) -w/ RESCALE macro (pir=0): 6.4 ns/call (sum:499984776.5128576) -w/ RESCALE macro (pir=1): 1.4 ns/call (sum:499984776.5128576) -w/ foldrescale1: 7.8 ns/call (sum:499984776.5128576) -w/ foldrescale2: 6.2 ns/call (sum:499984776.5128576) -w/ foldrescale3: 6.4 ns/call (sum:499984776.5128576) -w/ foldrescale (pir=0): 6.3 ns/call (sum:499984776.5128576) -w/ foldrescale (pir=1): 8.2 ns/call (sum:499984776.5128576) - (ans:0) - -alex@fiona /home/alex/numerics/finufft/devel> g++-9 -O3 -funroll-loops -march=native -I../include -fopenmp foldrescale_perf2.cpp -o foldrescale_perf2 -lgomp -Ofast -fno-finite-math-only -DNOBIN -alex@fiona /home/alex/numerics/finufft/devel> ./foldrescale_perf2 -simple array sum: 0.4 ns/call (sum:-14573.38274652959) -w/ RESCALE1 macro: 0.7 ns/call (sum:499926457.4098142) -w/ RESCALE macro (pir=0): 0.7 ns/call (sum:499926457.4098142) -w/ RESCALE macro (pir=1): 0.8 ns/call (sum:499926457.4098142) -w/ foldrescale1: 1.0 ns/call (sum:499926457.4098143) -w/ foldrescale2: 0.8 ns/call (sum:499926457.4098142) -w/ foldrescale3: 0.8 ns/call (sum:499926457.4098142) -w/ foldrescale (pir=0): 0.9 ns/call (sum:499926457.4098143) -w/ foldrescale (pir=1): 1.0 ns/call (sum:499926457.4098144) - (ans:0) -Concl: +alex@fiona /home/alex/numerics/finufft/devel> g++-9 -O3 -funroll-loops -march=native +-I../include -fopenmp foldrescale_perf2.cpp -o foldrescale_perf2 -lgomp -DNOBIN alex@fiona +/home/alex/numerics/finufft/devel> ./foldrescale_perf2 simple array sum: 1.3 +ns/call (sum:-5028.023988434961) w/ RESCALE1 macro: 1.3 ns/call +(sum:499984776.5128576) w/ RESCALE macro (pir=0): 6.4 ns/call (sum:499984776.5128576) w/ +RESCALE macro (pir=1): 1.4 ns/call (sum:499984776.5128576) w/ +foldrescale1: 7.8 ns/call (sum:499984776.5128576) w/ +foldrescale2: 6.2 ns/call (sum:499984776.5128576) w/ +foldrescale3: 6.4 ns/call (sum:499984776.5128576) w/ foldrescale +(pir=0): 6.3 ns/call (sum:499984776.5128576) w/ foldrescale (pir=1): 8.2 ns/call +(sum:499984776.5128576) (ans:0) + +alex@fiona /home/alex/numerics/finufft/devel> g++-9 -O3 -funroll-loops -march=native +-I../include -fopenmp foldrescale_perf2.cpp -o foldrescale_perf2 -lgomp -Ofast +-fno-finite-math-only -DNOBIN alex@fiona /home/alex/numerics/finufft/devel> +./foldrescale_perf2 simple array sum: 0.4 ns/call (sum:-14573.38274652959) w/ +RESCALE1 macro: 0.7 ns/call (sum:499926457.4098142) w/ RESCALE macro (pir=0): 0.7 +ns/call (sum:499926457.4098142) w/ RESCALE macro (pir=1): 0.8 ns/call +(sum:499926457.4098142) w/ foldrescale1: 1.0 ns/call (sum:499926457.4098143) w/ +foldrescale2: 0.8 ns/call (sum:499926457.4098142) w/ foldrescale3: 0.8 ns/call +(sum:499926457.4098142) w/ foldrescale (pir=0): 0.9 ns/call (sum:499926457.4098143) w/ +foldrescale (pir=1): 1.0 ns/call (sum:499926457.4098144) (ans:0) Concl: * foldrescale FUNCTION is only fast when Ofast & NOBIN, really weird. 
* macro *is* faster than function, even modern g++. * RESCALE is same as RESCALE1 @@ -118,32 +114,34 @@ can recover isnan handling with -Ofast -fno-finite-math-only .. good! #include "finufft/defs.h" #include +#include #include #include -#include // let's try the "modern" C++ way to time... yuk... #include using namespace std::chrono; - // old coord-handling macro ------------------------------------------------ -//#define RESCALE(x,N,p) (p ? \ -// (x*(FLT)M_1_2PI*N + (x*(FLT)M_1_2PI*N<-N/(FLT)2.0 ? (FLT)1.5 : (x*(FLT)M_1_2PI*N>N/(FLT)2.0 ? (FLT)-0.5 : (FLT)0.5))*N) : \ +// #define RESCALE(x,N,p) (p ? \ +// (x*(FLT)M_1_2PI*N + (x*(FLT)M_1_2PI*N<-N/(FLT)2.0 ? (FLT)1.5 : +// (x*(FLT)M_1_2PI*N>N/(FLT)2.0 ? (FLT)-0.5 : (FLT)0.5))*N) : \ // (x<(FLT)0.0 ? x+(FLT)N : (x>(FLT)N ? x-(FLT)N : x))) // casting makes no difference // cleaner rewrite, no slower: -#define RESCALE(x,N,p) (p ? \ - (x + (x>=-PI ? (x=0.0 ? (x<(FLT)N ? x : x-(FLT)N) : x+(FLT)N)) +#define RESCALE(x, N, p) \ + (p ? (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT)M_1_2PI * N) \ + : (x >= 0.0 ? (x < (FLT)N ? x : x - (FLT)N) : x + (FLT)N)) // pirange=1 fixed ver of old coord-handling macro ------------------------ -//#define RESCALE1(x,N) (x*(FLT)M_1_2PI*N + (x*(FLT)M_1_2PI*N<-N/(FLT)2.0 ? (FLT)1.5*N : (x*(FLT)M_1_2PI*N>N/(FLT)2.0 ? (FLT)-0.5*N : (FLT)0.5*N))) +// #define RESCALE1(x,N) (x*(FLT)M_1_2PI*N + (x*(FLT)M_1_2PI*N<-N/(FLT)2.0 ? (FLT)1.5*N : +// (x*(FLT)M_1_2PI*N>N/(FLT)2.0 ? (FLT)-0.5*N : (FLT)0.5*N))) // it does matter how written: this made faster... -//#define RESCALE1(x,N) (x*(FLT)M_1_2PI + (x*(FLT)M_1_2PI<-0.5 ? 1.5 : (x*(FLT)M_1_2PI>0.5 ? -0.5 : 0.5)))*N - -#define RESCALE1(x,N) (x + (x>=-PI ? (x0.5 ? -0.5 : 0.5)))*N +#define RESCALE1(x, N) \ + (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT)M_1_2PI * N) // function equivalents ----------------------------------------------------- static inline FLT foldrescale(FLT x, BIGINT N, int pirange) @@ -153,184 +151,199 @@ static inline FLT foldrescale(FLT x, BIGINT N, int pirange) { // affine rescale... FLT z = x; - if (pirange) - z = (N/(2*PI)) * (x+PI); // PI is (FLT)M_PI in defs.h + if (pirange) z = (N / (2 * PI)) * (x + PI); // PI is (FLT)M_PI in defs.h // fold... - if (z<(FLT)0.0) + if (z < (FLT)0.0) z += (FLT)N; - else if (z>=(FLT)N) + else if (z >= (FLT)N) z -= (FLT)N; return z; -} +} static inline FLT foldrescale1(FLT x, BIGINT N) // same as above but hardwired pirange=1. rescale then fold { // affine rescale always... - FLT z = (N/(2*PI)) * (x+PI); // PI is (FLT)M_PI in defs.h + FLT z = (N / (2 * PI)) * (x + PI); // PI is (FLT)M_PI in defs.h // fold... 
- if (z<(FLT)0.0) + if (z < (FLT)0.0) z += (FLT)N; - else if (z>=(FLT)N) + else if (z >= (FLT)N) z -= (FLT)N; return z; -} +} static inline FLT foldrescale2(FLT x, BIGINT N) // same as above but hardwired pirange=1, flip so fold done before rescale { - if (x<-PI) - x += 2*PI; - else if (x>PI) - x -= 2*PI; - return (N/(2*PI)) * (x+PI); -} + if (x < -PI) + x += 2 * PI; + else if (x > PI) + x -= 2 * PI; + return (N / (2 * PI)) * (x + PI); +} static inline FLT foldrescale3(FLT x, BIGINT N) // same as above but hardwired pirange=1, flip so fold done before rescale { - if (x<-PI) - x += 3*PI; - else if (x>PI) + if (x < -PI) + x += 3 * PI; + else if (x > PI) x -= PI; else x += PI; - return (N/(2*PI)) * x; + return (N / (2 * PI)) * x; } - - // ========================================================================== -int main(int argc, char* argv[]) -{ - int M=10000000; // default: # pts to test (>=1e7 is acc) - int N = 100; // grid size, matters that unknown @ compile - - if (argc>1) { double w; sscanf(argv[1],"%lf",&w); M = (int)w; } - if (argc>2) { double w; sscanf(argv[2],"%lf",&w); N = (int)w; } - std::vector c(N,0); // let's do basic binning while we're at it - // to prevent compiler optims - int maxc=0; // use for max bin count - +int main(int argc, char *argv[]) { + int M = 10000000; // default: # pts to test (>=1e7 is acc) + int N = 100; // grid size, matters that unknown @ compile + + if (argc > 1) { + double w; + sscanf(argv[1], "%lf", &w); + M = (int)w; + } + if (argc > 2) { + double w; + sscanf(argv[2], "%lf", &w); + N = (int)w; + } + std::vector c(N, 0); // let's do basic binning while we're at it + // to prevent compiler optims + int maxc = 0; // use for max bin count + // fill array w/ random #s (in par), deterministic seeds based on threads std::vector x(M); #pragma omp parallel { - unsigned int s=omp_get_thread_num(); // needed for parallel random #s -#pragma omp for schedule(dynamic,1000000) - for (int i=0; i=0 still 1:2 random) // We'll reuse this array by rescaling/unrescaling by hand. - - FLT sum=0.0; + + FLT sum = 0.0; auto tbegin = system_clock::now(); - for (int i=0;i dur = system_clock::now() - tbegin; // dur.count() is sec - printf("simple array sum: \t%.1f ns/call\t(sum:%.16g)\n",1e9*dur.count()/(double)M,sum); + for (int i = 0; i < M; ++i) sum += x[i]; // simply sweep through array + duration dur = system_clock::now() - tbegin; // dur.count() is sec + printf("simple array sum: \t%.1f ns/call\t(sum:%.16g)\n", + 1e9 * dur.count() / (double)M, sum); #ifndef NOBIN tbegin = system_clock::now(); - for (int i=0;i=N) printf("b[%d]=%d (x=%.16g, flt b=%.16g)\n",i,b,x[i],N*((1.0/(6*PI))*x[i] + 0.5)); // chk all indices ok! + // if (b<0 || b>=N) printf("b[%d]=%d (x=%.16g, flt + // b=%.16g)\n",i,b,x[i],N*((1.0/(6*PI))*x[i] + 0.5)); // chk all indices ok! } - dur = system_clock::now() - tbegin; // dur.count() is sec - for(int b=0;bmaxc) maxc=c[b]; // somehow use it - printf("simple bin over [-3pi,3pi): \t%.1f ns/call\t(ans:%d)\n",1e9*dur.count()/(double)M,maxc); + dur = system_clock::now() - tbegin; // dur.count() is sec + for (int b = 0; b < N; ++b) + if (c[b] > maxc) maxc = c[b]; // somehow use it + printf("simple bin over [-3pi,3pi): \t%.1f ns/call\t(ans:%d)\n", + 1e9 * dur.count() / (double)M, maxc); #endif - - sum = 0.0; // hardwired pirange=1 MACRO....................... + + sum = 0.0; // hardwired pirange=1 MACRO....................... 
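For reference on the variants above: foldrescale2 and foldrescale3 fold in x first (by -2*pi/+2*pi, or by +3*pi/-pi) and then rescale, whereas foldrescale and foldrescale1 rescale first and then fold in z by N; both orderings give the same grid coordinate. For example, with N = 100 and x = 2.5*pi, rescale-then-fold gives z = (100/(2*pi))*(2.5*pi + pi) = 175, folded to 175 - 100 = 75, while fold-then-rescale gives x - 2*pi = 0.5*pi and then (100/(2*pi))*(0.5*pi + pi) = 75.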
tbegin = system_clock::now(); - for (int i=0;imx) mx=x[i]; // chk max - //printf("max x=%.3g\n",mx); - sum = 0.0; + for (int i = 0; i < M; ++i) x[i] = (N / (2 * PI)) * (x[i] + PI); // rescale to [0,N) + // FLT mx=0.0; for (int i=0;imx) mx=x[i]; // chk max + // printf("max x=%.3g\n",mx); + sum = 0.0; tbegin = system_clock::now(); - for (int i=0;imaxc) maxc=c[b]; // somehow use it - printf("\t\t\t\t\t\t(ans:%d)\n",maxc); + maxc = 0; + for (int b = 0; b < N; ++b) + if (c[b] > maxc) maxc = c[b]; // somehow use it + printf("\t\t\t\t\t\t(ans:%d)\n", maxc); return 0; } diff --git a/devel/interp_square_nowrap.cpp b/devel/interp_square_nowrap.cpp index d17b32a89..8cd3758b5 100644 --- a/devel/interp_square_nowrap.cpp +++ b/devel/interp_square_nowrap.cpp @@ -1,29 +1,31 @@ // this is code I was messing with timing using time2d2interp.cpp // around May 3, 2018, to figure how wrapping was slowing down spreading. -void interp_square_nowrap(FLT *out,FLT *du, FLT *ker1, FLT *ker2, BIGINT i1,BIGINT i2,BIGINT N1,BIGINT N2,int ns) +void interp_square_nowrap(FLT *out, FLT *du, FLT *ker1, FLT *ker2, BIGINT i1, BIGINT i2, + BIGINT N1, BIGINT N2, int ns) // *************** don't periodic wrap, avoid ptrs. correct if no NU pts nr edge { - out[0] = 0.0; out[1] = 0.0; - if (0) { // plain - for (int dy=0; dy g++-7 test_ker_ppval.cpp -o test_ker_ppval -Ofast -march=native -funroll-loops -fopenmp -WITHOUT SSCANF FOR w: -alex@fiona /home/alex/numerics/finufft/devel> ./test_ker_ppval 10000000 -acc test: sup err scaled to kernel peak of 1: 6.53e-11 -exp(sqrt): M=10000000 w=12 in 1.03 s: 116 Meval/s (ans=2.73717868002952e+19) -Horner: M=10000000 w=12 in 0.0812 s: 1.48e+03 Meval/s (ans=2.73717867964406e+19) -rel err in sum = 1.41e-10 -WITH SSCANF FOR w: -alex@fiona /home/alex/numerics/finufft/devel> ./test_ker_ppval 10000000 -acc test: sup err scaled to kernel peak of 1: 6.53e-11 -exp(sqrt): M=10000000 w=12 in 0.45 s: 267 Meval/s (ans=2.73717867952762e+19) -Horner: M=10000000 w=12 in 0.483 s: 248 Meval/s (ans=2.73717867952754e+19) -rel err in sum = 3.01e-14 +alex@fiona /home/alex/numerics/finufft/devel> g++-7 test_ker_ppval.cpp -o test_ker_ppval +-Ofast -march=native -funroll-loops -fopenmp WITHOUT SSCANF FOR w: alex@fiona +/home/alex/numerics/finufft/devel> ./test_ker_ppval 10000000 acc test: sup err scaled to +kernel peak of 1: 6.53e-11 exp(sqrt): M=10000000 w=12 in 1.03 s: 116 Meval/s +(ans=2.73717868002952e+19) Horner: M=10000000 w=12 in 0.0812 s: 1.48e+03 Meval/s +(ans=2.73717867964406e+19) rel err in sum = 1.41e-10 WITH SSCANF FOR w: alex@fiona +/home/alex/numerics/finufft/devel> ./test_ker_ppval 10000000 acc test: sup err scaled to +kernel peak of 1: 6.53e-11 exp(sqrt): M=10000000 w=12 in 0.45 s: 267 Meval/s +(ans=2.73717867952762e+19) Horner: M=10000000 w=12 in 0.483 s: 248 Meval/s +(ans=2.73717867952754e+19) rel err in sum = 3.01e-14 */ -#include -#include -#include #include +#include +#include #include +#include // Choose prec... (w=7 enough for single) typedef double FLT; -//typedef float FLT; +// typedef float FLT; -static inline void evaluate_kernel_vector(FLT *ker, const FLT *args, const FLT beta, const FLT c, const int w) +static inline void evaluate_kernel_vector(FLT *ker, const FLT *args, const FLT beta, + const FLT c, const int w) /* Evaluate kernel for a vector of w arguments, must also be the int width par. The #pragra's need to be removed for icpc if -fopenmp not used; g++ is ok. 
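The quantity being timed in test_ker_ppval.cpp is FINUFFT's "exponential of semicircle" spreading kernel, phi(z) = exp(beta*sqrt(1 - c*z^2)) for |z| < w/2 and 0 outside, with c = 4/w^2 and beta close to 2.30*w as set in main() below (beta is adjusted slightly for w = 2, 3, 4). A scalar sketch for reference, with a hypothetical function name:

  #include <cmath>
  #include <cstdio>

  // Scalar sketch (illustrative only) of the kernel that evaluate_kernel_vector
  // times: phi(z) = exp(beta*sqrt(1 - c*z^2)) inside |z| < w/2, 0 outside.
  static double es_kernel_sketch(double z, int w) {
    const double beta = 2.30 * w;           // benchmark's default for w >= 5
    const double c    = 4.0 / (double)(w * w);
    if (std::fabs(z) >= 0.5 * w) return 0.0;
    return std::exp(beta * std::sqrt(1.0 - c * z * z));
  }

  int main() {
    const int w = 12;
    // peak value scaled by exp(beta) is 1, matching the accuracy test's normalization
    std::printf("scaled peak: %.3g\n", es_kernel_sketch(0.0, w) / std::exp(2.30 * w));
    std::printf("outside support: %.3g\n", es_kernel_sketch(0.6 * w, w));
    return 0;
  }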
*/ { #pragma omp simd - for (int i = 0; i < w; i++) - ker[i] = exp(beta * sqrt((FLT)1.0 - c*args[i]*args[i])); - // gcc 5.4 can't simd the combined loop, hence we split the - // out-of-support test to subsequent loop... - // This check loop prevents getting 0.2s (600 Meval/s): + for (int i = 0; i < w; i++) ker[i] = exp(beta * sqrt((FLT)1.0 - c * args[i] * args[i])); + // gcc 5.4 can't simd the combined loop, hence we split the + // out-of-support test to subsequent loop... + // This check loop prevents getting 0.2s (600 Meval/s): #pragma omp simd for (int i = 0; i < w; i++) - if (fabs(args[i]) >= (FLT)w/2) // note fabs not abs! + if (fabs(args[i]) >= (FLT)w / 2) // note fabs not abs! ker[i] = 0.0; } @@ -78,79 +76,79 @@ static inline void kernel_vector_Horner(FLT *ker, FLT z, int w) See: gen_all_horner_C_code.m */ { - //#include "../src/ker_horner_allw.c" + // #include "../src/ker_horner_allw.c" #include "../src/ker_horner_allw_loop.c" } -int main(int argc, char* argv[]) -{ - int M = (int) 1e7; // # of reps (<2^31) - if (argc>1) - sscanf(argv[1],"%d",&M); // weirdly allows exp simd 10x faster, even on gcc 5.4.0 - int w=12; // spread width - if (argc>2) - sscanf(argv[2],"%d",&w); // prevents the magic 0.2s, keeps at 0.4s - FLT beta=2.30*w; // should match kernel params for acc test - if (w==2) beta = 2.20*w; - if (w==3) beta = 2.26*w; - if (w==4) beta = 2.38*w; - FLT c = 4.0/(FLT)(w*w); // set up ker params for plain eval - FLT iw = 1.0/(FLT)w; // scale factor +int main(int argc, char *argv[]) { + int M = (int)1e7; // # of reps (<2^31) + if (argc > 1) + sscanf(argv[1], "%d", &M); // weirdly allows exp simd 10x faster, even on + // gcc 5.4.0 + int w = 12; // spread width + if (argc > 2) sscanf(argv[2], "%d", &w); // prevents the magic 0.2s, keeps at 0.4s + FLT beta = 2.30 * w; // should match kernel params for acc test + if (w == 2) beta = 2.20 * w; + if (w == 3) beta = 2.26 * w; + if (w == 4) beta = 2.38 * w; + FLT c = 4.0 / (FLT)(w * w); // set up ker params for plain eval + FLT iw = 1.0 / (FLT)w; // scale factor std::vector x(w); - std::vector f(16), f2(16); // length=MAX_NSPREAD + std::vector f(16), f2(16); // length=MAX_NSPREAD - int Macc = 100; // test accuracy....... + int Macc = 100; // test accuracy....... FLT superr = 0.0; - for (int i=0;isuperr) superr = err; + for (int i = 0; i < Macc; ++i) { // loop over eval grid sets + FLT z = (2 * i) / (FLT)(Macc - 1) - 1.0; // local offset sweep through z in [-1,1] + // printf("z=%g:\n",z); // useful for calling w/ eg Macc=3 + kernel_vector_Horner(&f2[0], z, w); // eval kernel to f2, given offset z + for (int j = 0; j < w; ++j) // vector of args in [-w/2,w/2] ker supp + x[j] = (-(FLT)w + 1.0 + z) / 2 + j; + evaluate_kernel_vector(&f[0], &x[0], beta, c, w); // eval kernel into f + for (int j = 0; j < w; ++j) { + // printf("x=%.3g\tf=%.6g\tf2=%.6g\tf2-f=%.3g\n",x[j],f[j],f2[j],f2[j]-f[j]); + FLT err = abs(f[j] - f2[j]); + if (err > superr) superr = err; } } superr /= exp(beta); - printf("acc test: sup err scaled to kernel peak of 1: %.3g\n",superr); - + printf("acc test: sup err scaled to kernel peak of 1: %.3g\n", superr); + // test speed...... plain eval - clock_t start=clock(); - FLT ans = 0.0; // dummy answer - for (int i=0;i -#include -#include #include +#include +#include #include +#include // Choose prec for floating pt... 
typedef double FLT; #define MAXNS 16 -int main(int argc, char *argv[]) -{ - int M = 10000000; // NU pts - int n = 2000; // U grid pts per dimension (needn't be huge) - if (argc>1) - sscanf(argv[1],"%d",&M); - if (argc>2) - sscanf(argv[2],"%d",&n); - int ns=10; // kernel width - if (argc>3) - sscanf(argv[3],"%d",&ns); - FLT ker1[MAXNS],ker2[MAXNS]; - - std::vector du(2*n*n); // U "input" array, with... - for (int i=0;i<2*n*n;++i) // something in it +int main(int argc, char *argv[]) { + int M = 10000000; // NU pts + int n = 2000; // U grid pts per dimension (needn't be huge) + if (argc > 1) sscanf(argv[1], "%d", &M); + if (argc > 2) sscanf(argv[2], "%d", &n); + int ns = 10; // kernel width + if (argc > 3) sscanf(argv[3], "%d", &ns); + FLT ker1[MAXNS], ker2[MAXNS]; + + std::vector du(2 * n * n); // U "input" array, with... + for (int i = 0; i < 2 * n * n; ++i) // something in it du[i] = (FLT)i; - - clock_t start=clock(); - FLT tot[2] = {0.0,0.0}; // complex output total - int N1=n, N2=n; - int i1=n/4, i2=n/4+7; // starting pt for bottom left coords of interp box - - for (int i=0;i3*n/4) {i1-=n/2; i2+=1;} // keep spread box away from edges - //i2 += 57; // move far in slow direc - causes pain - if (i2>3*n/4) i2-=n/2; - - } // ....................... - double t=(double)(clock()-start)/CLOCKS_PER_SEC; - printf("M=%d from N=%d^2, ns=%d: tot[0]=%.15g \t%.3g s\n",M,n,ns,tot[0],t); - printf("%.3g spread pts/s\n",M*ns*ns/t); + if (i1 > 3 * n / 4) { + i1 -= n / 2; + i2 += 1; + } // keep spread box away from edges + // i2 += 57; // move far in slow direc - causes pain + if (i2 > 3 * n / 4) i2 -= n / 2; + + } // ....................... + double t = (double)(clock() - start) / CLOCKS_PER_SEC; + printf("M=%d from N=%d^2, ns=%d: tot[0]=%.15g \t%.3g s\n", M, n, ns, tot[0], t); + printf("%.3g spread pts/s\n", M * ns * ns / t); return 0; } diff --git a/docs/devnotes.rst b/docs/devnotes.rst index f95729a5a..2a3683761 100644 --- a/docs/devnotes.rst +++ b/docs/devnotes.rst @@ -3,7 +3,7 @@ Developer notes =============== -* Developers needing to update/regenerate documentation in general, including our readthedocs website, see ``docs/README``. Developers changing MATLAB/octave interfaces or docs, also see ``matlab/README``. +* Developers needing to update/regenerate documentation in general, including our readthedocs website, see ``docs/README``. Developers changing MATLAB/octave interfaces or docs, also see ``matlab/README``. Please also see ``contributing.md`` for code style and git hook guidelines. * To update the version number, this needs to be done by hand in the following places: diff --git a/examples/cuda/example2d1many.cpp b/examples/cuda/example2d1many.cpp index e67f8c30d..a5d0ecd5d 100644 --- a/examples/cuda/example2d1many.cpp +++ b/examples/cuda/example2d1many.cpp @@ -21,97 +21,101 @@ int main(int argc, char *argv[]) * example code for 2D Type 1 transformation. 
* * To compile the code: - * nvcc example2d1many.cpp -o example2d1many -I/loc/to/cufinufft/include /loc/to/cufinufft/lib-static/libcufinufft.a - * -lcudart -lcufft -lnvToolsExt + * nvcc example2d1many.cpp -o example2d1many -I/loc/to/cufinufft/include + * /loc/to/cufinufft/lib-static/libcufinufft.a -lcudart -lcufft -lnvToolsExt * * or * export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/loc/to/cufinufft/lib - * nvcc example2d1many.cpp -o example2d1many -I/loc/to/cufinufft/include -L/loc/to/cufinufft/lib/ -lcufinufft + * nvcc example2d1many.cpp -o example2d1many -I/loc/to/cufinufft/include + * -L/loc/to/cufinufft/lib/ -lcufinufft * * */ { - std::cout << std::scientific << std::setprecision(3); - - int ier; - int N1 = 256; - int N2 = 256; - int M = 65536; - int ntransf = 2; - int iflag = 1; - float tol = 1e-6; - - float *x, *y; - std::complex *c, *fk; - cudaMallocHost(&x, M * sizeof(float)); - cudaMallocHost(&y, M * sizeof(float)); - cudaMallocHost(&c, M * ntransf * sizeof(std::complex)); - cudaMallocHost(&fk, N1 * N2 * ntransf * sizeof(std::complex)); - - float *d_x, *d_y; - cuFloatComplex *d_c, *d_fk; - cudaMalloc(&d_x, M * sizeof(float)); - cudaMalloc(&d_y, M * sizeof(float)); - cudaMalloc(&d_c, M * ntransf * sizeof(cuFloatComplex)); - cudaMalloc(&d_fk, N1 * N2 * ntransf * sizeof(cuFloatComplex)); - - std::default_random_engine eng(1); - std::uniform_real_distribution distr(-1, 1); - - for (int i = 0; i < M; i++) { - x[i] = M_PI * distr(eng); - y[i] = M_PI * distr(eng); - } - - for (int i = 0; i < M * ntransf; i++) { - c[i].real(distr(eng)); - c[i].imag(distr(eng)); - } - cudaMemcpy(d_x, x, M * sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(d_y, y, M * sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(d_c, c, M * ntransf * sizeof(cuFloatComplex), cudaMemcpyHostToDevice); - - cufinufftf_plan dplan; - - int dim = 2; - int64_t nmodes[3]; - int type = 1; - - nmodes[0] = N1; - nmodes[1] = N2; - nmodes[2] = 1; - - ier = cufinufftf_makeplan(type, dim, nmodes, iflag, ntransf, tol, &dplan, NULL); - - ier = cufinufftf_setpts(dplan, M, d_x, d_y, NULL, 0, NULL, NULL, NULL); - - ier = cufinufftf_execute(dplan, d_c, d_fk); - - ier = cufinufftf_destroy(dplan); - - cudaMemcpy(fk, d_fk, N1 * N2 * ntransf * sizeof(cuFloatComplex), cudaMemcpyDeviceToHost); - - std::cout << std::endl << "Accuracy check:" << std::endl; - int N = N1 * N2; - for (int i = 0; i < ntransf; i += 1) { - int nt1 = (int)(0.37 * N1), nt2 = (int)(0.26 * N2); // choose some mode index to check - std::complex Ft = std::complex(0, 0), J = std::complex(0, 1) * (float)iflag; - for (CUFINUFFT_BIGINT j = 0; j < M; ++j) - Ft += c[j + i * M] * exp(J * (nt1 * x[j] + nt2 * y[j])); // crude direct - int it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2); // index in complex F as 1d array - printf("[gpu %3d] one mode: abs err in F[%d,%d] is %.3g\n", i, nt1, nt2, abs(Ft - fk[it + i * N])); - printf("[gpu %3d] one mode: rel err in F[%d,%d] is %.3g\n", i, nt1, nt2, - abs(Ft - fk[it + i * N]) / infnorm(N, fk + i * N)); - } - - cudaFreeHost(x); - cudaFreeHost(y); - cudaFreeHost(c); - cudaFreeHost(fk); - - cudaFree(d_x); - cudaFree(d_y); - cudaFree(d_c); - cudaFree(d_fk); - return 0; + std::cout << std::scientific << std::setprecision(3); + + int ier; + int N1 = 256; + int N2 = 256; + int M = 65536; + int ntransf = 2; + int iflag = 1; + float tol = 1e-6; + + float *x, *y; + std::complex *c, *fk; + cudaMallocHost(&x, M * sizeof(float)); + cudaMallocHost(&y, M * sizeof(float)); + cudaMallocHost(&c, M * ntransf * sizeof(std::complex)); + cudaMallocHost(&fk, N1 * N2 * 
ntransf * sizeof(std::complex)); + + float *d_x, *d_y; + cuFloatComplex *d_c, *d_fk; + cudaMalloc(&d_x, M * sizeof(float)); + cudaMalloc(&d_y, M * sizeof(float)); + cudaMalloc(&d_c, M * ntransf * sizeof(cuFloatComplex)); + cudaMalloc(&d_fk, N1 * N2 * ntransf * sizeof(cuFloatComplex)); + + std::default_random_engine eng(1); + std::uniform_real_distribution distr(-1, 1); + + for (int i = 0; i < M; i++) { + x[i] = M_PI * distr(eng); + y[i] = M_PI * distr(eng); + } + + for (int i = 0; i < M * ntransf; i++) { + c[i].real(distr(eng)); + c[i].imag(distr(eng)); + } + cudaMemcpy(d_x, x, M * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_y, y, M * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_c, c, M * ntransf * sizeof(cuFloatComplex), cudaMemcpyHostToDevice); + + cufinufftf_plan dplan; + + int dim = 2; + int64_t nmodes[3]; + int type = 1; + + nmodes[0] = N1; + nmodes[1] = N2; + nmodes[2] = 1; + + ier = cufinufftf_makeplan(type, dim, nmodes, iflag, ntransf, tol, &dplan, NULL); + + ier = cufinufftf_setpts(dplan, M, d_x, d_y, NULL, 0, NULL, NULL, NULL); + + ier = cufinufftf_execute(dplan, d_c, d_fk); + + ier = cufinufftf_destroy(dplan); + + cudaMemcpy(fk, d_fk, N1 * N2 * ntransf * sizeof(cuFloatComplex), + cudaMemcpyDeviceToHost); + + std::cout << std::endl << "Accuracy check:" << std::endl; + int N = N1 * N2; + for (int i = 0; i < ntransf; i += 1) { + int nt1 = (int)(0.37 * N1), nt2 = (int)(0.26 * N2); // choose some mode index to check + std::complex Ft = std::complex(0, 0), + J = std::complex(0, 1) * (float)iflag; + for (CUFINUFFT_BIGINT j = 0; j < M; ++j) + Ft += c[j + i * M] * exp(J * (nt1 * x[j] + nt2 * y[j])); // crude direct + int it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2); // index in complex F as 1d array + printf("[gpu %3d] one mode: abs err in F[%d,%d] is %.3g\n", i, nt1, nt2, + abs(Ft - fk[it + i * N])); + printf("[gpu %3d] one mode: rel err in F[%d,%d] is %.3g\n", i, nt1, nt2, + abs(Ft - fk[it + i * N]) / infnorm(N, fk + i * N)); + } + + cudaFreeHost(x); + cudaFreeHost(y); + cudaFreeHost(c); + cudaFreeHost(fk); + + cudaFree(d_x); + cudaFree(d_y); + cudaFree(d_c); + cudaFree(d_fk); + return 0; } diff --git a/examples/cuda/example2d2many.cpp b/examples/cuda/example2d2many.cpp index f35b10205..a6b0c6d3e 100644 --- a/examples/cuda/example2d2many.cpp +++ b/examples/cuda/example2d2many.cpp @@ -21,106 +21,109 @@ int main(int argc, char *argv[]) * example code for 2D Type 1 transformation. 
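For reference, the crude direct sums used as accuracy checks in these 2D examples implement the transform definitions themselves. With sign sigma = iflag and mode indices k_1 in [-N_1/2, N_1/2 - 1], k_2 in [-N_2/2, N_2/2 - 1]:

  F(k_1,k_2) = \sum_{j=1}^{M} c_j \, e^{i \sigma (k_1 x_j + k_2 y_j)}                                   (type 1)
  c_j = \sum_{k_1=-N_1/2}^{N_1/2-1} \sum_{k_2=-N_2/2}^{N_2/2-1} F(k_1,k_2) \, e^{i \sigma (k_1 x_j + k_2 y_j)}   (type 2)

example2d1many checks one mode (k_1,k_2) of the type-1 sum; example2d2many, whose diff follows, checks one target point j of the type-2 sum.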
* * To compile the code: - * nvcc example2d2many.cpp -o example2d2many loc/to/cufinufft/lib-static/libcufinufft.a -I/loc/to/cufinufft/include - * -lcudart -lcufft -lnvToolsExt + * nvcc example2d2many.cpp -o example2d2many loc/to/cufinufft/lib-static/libcufinufft.a + * -I/loc/to/cufinufft/include -lcudart -lcufft -lnvToolsExt * * or * export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/loc/to/cufinufft/lib - * nvcc example2d2many.cpp -L/loc/to/cufinufft/lib/ -I/loc/to/cufinufft/include -o example2d1 -lcufinufft + * nvcc example2d2many.cpp -L/loc/to/cufinufft/lib/ -I/loc/to/cufinufft/include -o + * example2d1 -lcufinufft * * */ { - std::cout << std::scientific << std::setprecision(3); - - int ier; - int N1 = 128; - int N2 = 128; - int M = 10; - int ntransf = 4; - int maxbatchsize = 4; - int iflag = 1; - double tol = 1e-6; - - double *x, *y; - std::complex *c, *fk; - cudaMallocHost(&x, M * sizeof(double)); - cudaMallocHost(&y, M * sizeof(double)); - cudaMallocHost(&c, M * ntransf * sizeof(std::complex)); - cudaMallocHost(&fk, N1 * N2 * ntransf * sizeof(std::complex)); - - double *d_x, *d_y; - cuDoubleComplex *d_c, *d_fk; - cudaMalloc(&d_x, M * sizeof(double)); - cudaMalloc(&d_y, M * sizeof(double)); - cudaMalloc(&d_c, M * ntransf * sizeof(cuDoubleComplex)); - cudaMalloc(&d_fk, N1 * N2 * ntransf * sizeof(cuDoubleComplex)); - - std::default_random_engine eng(1); - std::uniform_real_distribution distr(-1, 1); - - for (int i = 0; i < M; i++) { - x[i] = M_PI * distr(eng); - y[i] = M_PI * distr(eng); - } - - for (int i = 0; i < N1 * N2 * ntransf; i++) { - fk[i].real(distr(eng)); - fk[i].imag(distr(eng)); - } - cudaMemcpy(d_x, x, M * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(d_y, y, M * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(d_fk, fk, N1 * N2 * ntransf * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice); - - cufinufft_plan dplan; - - int dim = 2; - int64_t nmodes[3]; - int type = 2; - - nmodes[0] = N1; - nmodes[1] = N2; - nmodes[2] = 1; - - cufinufft_opts opts; - cufinufft_default_opts(&opts); - opts.gpu_maxbatchsize = maxbatchsize; - - ier = cufinufft_makeplan(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts); - - ier = cufinufft_setpts(dplan, M, d_x, d_y, NULL, 0, NULL, NULL, NULL); - - ier = cufinufft_execute(dplan, d_c, d_fk); - - ier = cufinufft_destroy(dplan); - - cudaMemcpy(c, d_c, M * ntransf * sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost); - - std::cout << std::endl << "Accuracy check:" << std::endl; - std::complex *fkstart; - std::complex *cstart; - for (int t = 0; t < ntransf; t++) { - fkstart = fk + t * N1 * N2; - cstart = c + t * M; - int jt = M / 2; // check arbitrary choice of one targ pt - std::complex J(0, iflag * 1); - std::complex ct(0, 0); - int m = 0; - for (int m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2) // loop in correct order over F - for (int m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1) - ct += fkstart[m++] * exp(J * (m1 * x[jt] + m2 * y[jt])); // crude direct - - printf("[gpu %3d] one targ: rel err in c[%d] is %.3g\n", t, jt, abs(cstart[jt] - ct) / infnorm(M, c)); - } - - cudaFreeHost(x); - cudaFreeHost(y); - cudaFreeHost(c); - cudaFreeHost(fk); - - cudaFree(d_x); - cudaFree(d_y); - cudaFree(d_c); - cudaFree(d_fk); - return 0; + std::cout << std::scientific << std::setprecision(3); + + int ier; + int N1 = 128; + int N2 = 128; + int M = 10; + int ntransf = 4; + int maxbatchsize = 4; + int iflag = 1; + double tol = 1e-6; + + double *x, *y; + std::complex *c, *fk; + cudaMallocHost(&x, M * sizeof(double)); + cudaMallocHost(&y, M * sizeof(double)); + 
cudaMallocHost(&c, M * ntransf * sizeof(std::complex)); + cudaMallocHost(&fk, N1 * N2 * ntransf * sizeof(std::complex)); + + double *d_x, *d_y; + cuDoubleComplex *d_c, *d_fk; + cudaMalloc(&d_x, M * sizeof(double)); + cudaMalloc(&d_y, M * sizeof(double)); + cudaMalloc(&d_c, M * ntransf * sizeof(cuDoubleComplex)); + cudaMalloc(&d_fk, N1 * N2 * ntransf * sizeof(cuDoubleComplex)); + + std::default_random_engine eng(1); + std::uniform_real_distribution distr(-1, 1); + + for (int i = 0; i < M; i++) { + x[i] = M_PI * distr(eng); + y[i] = M_PI * distr(eng); + } + + for (int i = 0; i < N1 * N2 * ntransf; i++) { + fk[i].real(distr(eng)); + fk[i].imag(distr(eng)); + } + cudaMemcpy(d_x, x, M * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(d_y, y, M * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(d_fk, fk, N1 * N2 * ntransf * sizeof(cuDoubleComplex), + cudaMemcpyHostToDevice); + + cufinufft_plan dplan; + + int dim = 2; + int64_t nmodes[3]; + int type = 2; + + nmodes[0] = N1; + nmodes[1] = N2; + nmodes[2] = 1; + + cufinufft_opts opts; + cufinufft_default_opts(&opts); + opts.gpu_maxbatchsize = maxbatchsize; + + ier = cufinufft_makeplan(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts); + + ier = cufinufft_setpts(dplan, M, d_x, d_y, NULL, 0, NULL, NULL, NULL); + + ier = cufinufft_execute(dplan, d_c, d_fk); + + ier = cufinufft_destroy(dplan); + + cudaMemcpy(c, d_c, M * ntransf * sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost); + + std::cout << std::endl << "Accuracy check:" << std::endl; + std::complex *fkstart; + std::complex *cstart; + for (int t = 0; t < ntransf; t++) { + fkstart = fk + t * N1 * N2; + cstart = c + t * M; + int jt = M / 2; // check arbitrary choice of one targ pt + std::complex J(0, iflag * 1); + std::complex ct(0, 0); + int m = 0; + for (int m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2) // loop in correct order over F + for (int m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1) + ct += fkstart[m++] * exp(J * (m1 * x[jt] + m2 * y[jt])); // crude direct + + printf("[gpu %3d] one targ: rel err in c[%d] is %.3g\n", t, jt, + abs(cstart[jt] - ct) / infnorm(M, c)); + } + + cudaFreeHost(x); + cudaFreeHost(y); + cudaFreeHost(c); + cudaFreeHost(fk); + + cudaFree(d_x); + cudaFree(d_y); + cudaFree(d_c); + cudaFree(d_fk); + return 0; } diff --git a/examples/cuda/getting_started.cpp b/examples/cuda/getting_started.cpp index 113a73e7c..da2bf6f5f 100644 --- a/examples/cuda/getting_started.cpp +++ b/examples/cuda/getting_started.cpp @@ -26,90 +26,91 @@ #include int main() { - // Problem size: number of nonuniform points (M) and grid size (N). - const int M = 100000, N = 10000; + // Problem size: number of nonuniform points (M) and grid size (N). + const int M = 100000, N = 10000; - // Size of the grid as an array. - int64_t modes[1] = {N}; + // Size of the grid as an array. + int64_t modes[1] = {N}; - // Host pointers: frequencies (x), coefficients (c), and output (f). - float *x; - float _Complex *c; - float _Complex *f; + // Host pointers: frequencies (x), coefficients (c), and output (f). + float *x; + float _Complex *c; + float _Complex *f; - // Device pointers. - float *d_x; - cuFloatComplex *d_c, *d_f; + // Device pointers. + float *d_x; + cuFloatComplex *d_c, *d_f; - // Store cufinufft plan. - cufinufftf_plan plan; + // Store cufinufft plan. + cufinufftf_plan plan; - // Manual calculation at a single point idx. - int idx; - float _Complex f0; + // Manual calculation at a single point idx. + int idx; + float _Complex f0; - // Allocate the host arrays. 
- x = (float *)malloc(M * sizeof(float)); - c = (float _Complex *)malloc(M * sizeof(float _Complex)); - f = (float _Complex *)malloc(N * sizeof(float _Complex)); + // Allocate the host arrays. + x = (float *)malloc(M * sizeof(float)); + c = (float _Complex *)malloc(M * sizeof(float _Complex)); + f = (float _Complex *)malloc(N * sizeof(float _Complex)); - // Fill with random numbers. Frequencies must be in the interval [-pi, pi) - // while strengths can be any value. - srand(0); + // Fill with random numbers. Frequencies must be in the interval [-pi, pi) + // while strengths can be any value. + srand(0); - for (int j = 0; j < M; ++j) { - x[j] = 2 * M_PI * (((float)rand()) / RAND_MAX - 1); - c[j] = (2 * ((float)rand()) / RAND_MAX - 1) + I * (2 * ((float)rand()) / RAND_MAX - 1); - } + for (int j = 0; j < M; ++j) { + x[j] = 2 * M_PI * (((float)rand()) / RAND_MAX - 1); + c[j] = + (2 * ((float)rand()) / RAND_MAX - 1) + I * (2 * ((float)rand()) / RAND_MAX - 1); + } - // Allocate the device arrays and copy the x and c arrays. - cudaMalloc(&d_x, M * sizeof(float)); - cudaMalloc(&d_c, M * sizeof(float _Complex)); - cudaMalloc(&d_f, N * sizeof(float _Complex)); + // Allocate the device arrays and copy the x and c arrays. + cudaMalloc(&d_x, M * sizeof(float)); + cudaMalloc(&d_c, M * sizeof(float _Complex)); + cudaMalloc(&d_f, N * sizeof(float _Complex)); - cudaMemcpy(d_x, x, M * sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(d_c, c, M * sizeof(float _Complex), cudaMemcpyHostToDevice); + cudaMemcpy(d_x, x, M * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_c, c, M * sizeof(float _Complex), cudaMemcpyHostToDevice); - // Make the cufinufft plan for a 1D type-1 transform with six digits of - // tolerance. - cufinufftf_makeplan(1, 1, modes, 1, 1, 1e-6, &plan, NULL); + // Make the cufinufft plan for a 1D type-1 transform with six digits of + // tolerance. + cufinufftf_makeplan(1, 1, modes, 1, 1, 1e-6, &plan, NULL); - // Set the frequencies of the nonuniform points. - cufinufftf_setpts(plan, M, d_x, NULL, NULL, 0, NULL, NULL, NULL); + // Set the frequencies of the nonuniform points. + cufinufftf_setpts(plan, M, d_x, NULL, NULL, 0, NULL, NULL, NULL); - // Actually execute the plan on the given coefficients and store the result - // in the d_f array. - cufinufftf_execute(plan, d_c, d_f); + // Actually execute the plan on the given coefficients and store the result + // in the d_f array. + cufinufftf_execute(plan, d_c, d_f); - // Copy the result back onto the host. - cudaMemcpy(f, d_f, N * sizeof(float _Complex), cudaMemcpyDeviceToHost); + // Copy the result back onto the host. + cudaMemcpy(f, d_f, N * sizeof(float _Complex), cudaMemcpyDeviceToHost); - // Destroy the plan and free the device arrays after we're done. - cufinufftf_destroy(plan); + // Destroy the plan and free the device arrays after we're done. + cufinufftf_destroy(plan); - cudaFree(d_x); - cudaFree(d_c); - cudaFree(d_f); + cudaFree(d_x); + cudaFree(d_c); + cudaFree(d_f); - // Pick an index to check the result of the calculation. - idx = 4 * N / 7; + // Pick an index to check the result of the calculation. + idx = 4 * N / 7; - printf("f[%d] = %lf + %lfi\n", idx, crealf(f[idx]), cimagf(f[idx])); + printf("f[%d] = %lf + %lfi\n", idx, crealf(f[idx]), cimagf(f[idx])); - // Calculate the result manually using the formula for the type-1 - // transform. - f0 = 0; + // Calculate the result manually using the formula for the type-1 + // transform. 
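The manual check that follows relies on the mode-ordered output convention used throughout these examples: the coefficient of frequency k (k = -N/2, ..., N/2 - 1) is stored at array index k + N/2, so the mode verified at index idx is k = idx - N/2 and the direct sum is f[idx] ~ \sum_j c_j e^{i k x_j} (iflag = +1 here). The same offset appears later in the CPU examples as nout = n + N/2 and kout = k + N/2.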
+ f0 = 0; - for (int j = 0; j < M; ++j) { - f0 += c[j] * cexp(I * x[j] * (idx - N / 2)); - } + for (int j = 0; j < M; ++j) { + f0 += c[j] * cexp(I * x[j] * (idx - N / 2)); + } - printf("f0[%d] = %lf + %lfi\n", idx, crealf(f0), cimagf(f0)); + printf("f0[%d] = %lf + %lfi\n", idx, crealf(f0), cimagf(f0)); - // Finally free the host arrays. - free(x); - free(c); - free(f); + // Finally free the host arrays. + free(x); + free(c); + free(f); - return 0; + return 0; } diff --git a/examples/guru1d1.cpp b/examples/guru1d1.cpp index eb7189da0..35c626093 100644 --- a/examples/guru1d1.cpp +++ b/examples/guru1d1.cpp @@ -1,87 +1,90 @@ // this is all you must include for the finufft lib... -#include #include +#include // specific to this example... +#include #include -#include #include #include -#include +#include // only good for small projects... using namespace std; // allows 1i to be the imaginary unit... (C++14 onwards) using namespace std::complex_literals; -int main(int argc, char* argv[]) +int main(int argc, char *argv[]) /* Example calling guru C++ interface to FINUFFT library, passing pointers to STL vectors of C++ double complex numbers, with a math check. Barnett 2/27/20 Compile on linux with (or see ../makefile): - g++ -std=c++14 -fopenmp guru1d1.cpp -I../include ../lib-static/libfinufft.a -o guru1d1 -lfftw3 -lfftw3_omp -lm + g++ -std=c++14 -fopenmp guru1d1.cpp -I../include ../lib-static/libfinufft.a -o guru1d1 + -lfftw3 -lfftw3_omp -lm Or if you have built a single-thread library, remove -fopenmp and -lfftw3_omp Usage: ./guru1d1 */ { - int M = 1e6; // number of nonuniform points - int N = 1e6; // number of modes - double tol = 1e-9; // desired accuracy + int M = 1e6; // number of nonuniform points + int N = 1e6; // number of modes + double tol = 1e-9; // desired accuracy - int type = 1, dim = 1; // 1d1 - int64_t Ns[3]; // guru describes mode array by vector [N1,N2..] - Ns[0] = N; - int ntransf = 1; // we want to do a single transform at a time - finufft_plan plan; // creates a plan struct - int changeopts = 0; // do you want to try changing opts? 0 or 1 - if (changeopts) { // demo how to change options away from defaults.. + int type = 1, dim = 1; // 1d1 + int64_t Ns[3]; // guru describes mode array by vector [N1,N2..] + Ns[0] = N; + int ntransf = 1; // we want to do a single transform at a time + finufft_plan plan; // creates a plan struct + int changeopts = 0; // do you want to try changing opts? 0 or 1 + if (changeopts) { // demo how to change options away from defaults.. finufft_opts opts; finufft_default_opts(&opts); - opts.debug = 1; // example options change + opts.debug = 1; // example options change finufft_makeplan(type, dim, Ns, +1, ntransf, tol, &plan, &opts); - } else // or, NULL here means use default opts... + } else // or, NULL here means use default opts... finufft_makeplan(type, dim, Ns, +1, ntransf, tol, &plan, NULL); // generate some random nonuniform points vector x(M); - for (int j=0; j> c(M); - for (int j=0; j> F(N); int ier = finufft_execute(plan, c.data(), F.data()); // for fun, do another with same NU pts (no re-sorting), but new strengths... 
- for (int j=0; j=-(double)N/2 && n<(double)N/2); // ensure meaningful test - complex Ftest = complex(0,0); - for (int j=0; j= -(double)N / 2 && n < (double)N / 2); // ensure meaningful test + complex Ftest = complex(0, 0); + for (int j = 0; j < M; ++j) Ftest += c[j] * exp(1i * (double)n * x[j]); + int nout = n + N / 2; // index in output array for freq mode n double Fmax = 0.0; // compute inf norm of F - for (int m=0; mFmax) Fmax=aF; + if (aF > Fmax) Fmax = aF; } - double err = abs(F[nout] - Ftest)/Fmax; - printf("guru 1D type-1 double-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n",ier,n,err); + double err = abs(F[nout] - Ftest) / Fmax; + printf("guru 1D type-1 double-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n", ier, + n, err); return ier; } diff --git a/examples/guru1d1c.c b/examples/guru1d1c.c index 7ad036f4b..ebd337127 100644 --- a/examples/guru1d1c.c +++ b/examples/guru1d1c.c @@ -2,84 +2,91 @@ #include // specific to this example... +#include #include #include #include -#include -int main(int argc, char* argv[]) +int main(int argc, char *argv[]) /* Example calling guru interface to FINUFFT library from C, using C complex type, with a math check. Barnett 6/22/20. Compile on linux with: - gcc-7 -fopenmp guru1d1c.c -I../include ../lib-static/libfinufft.a -o guru1d1c -lfftw3 -lfftw3_omp -lm -lstdc++ + gcc-7 -fopenmp guru1d1c.c -I../include ../lib-static/libfinufft.a -o guru1d1c -lfftw3 + -lfftw3_omp -lm -lstdc++ Or if you have built a single-core library, remove -fopenmp and -lfftw3_omp Usage: ./guru1d1c. See also: guru1d1 */ { - int M = 1e6; // number of nonuniform points - int N = 1e6; // number of modes - double tol = 1e-9; // desired accuracy + int M = 1e6; // number of nonuniform points + int N = 1e6; // number of modes + double tol = 1e-9; // desired accuracy - int type = 1, dim = 1; // 1d1 - int64_t Ns[3]; // guru describes mode array by vector [N1,N2..] - int ntransf = 1; // we want to do a single transform at a time - int64_t j,m,nout; + int type = 1, dim = 1; // 1d1 + int64_t Ns[3]; // guru describes mode array by vector [N1,N2..] + int ntransf = 1; // we want to do a single transform at a time + int64_t j, m, nout; int ier; - double *x,err,Fmax,aF; - double complex *c,*F,Ftest; + double *x, err, Fmax, aF; + double complex *c, *F, Ftest; - finufft_opts* popts; // pointer to opts struct - finufft_plan plan; // pointer to (also C-compatible) plan struct - Ns[0] = N; // mode numbers for plan - int changeopts = 0; // do you want to try changing opts? 0 or 1 - if (changeopts) { // demo how to change options away from defaults.. - popts = (finufft_opts *)malloc(sizeof(finufft_opts)); // allocate it + finufft_opts *popts; // pointer to opts struct + finufft_plan plan; // pointer to (also C-compatible) plan struct + Ns[0] = N; // mode numbers for plan + int changeopts = 0; // do you want to try changing opts? 0 or 1 + if (changeopts) { // demo how to change options away from defaults.. + popts = (finufft_opts *)malloc(sizeof(finufft_opts)); // allocate it finufft_default_opts(popts); - popts->debug = 1; // example options change - popts->nthreads = 4; // " + popts->debug = 1; // example options change + popts->nthreads = 4; // " finufft_makeplan(type, dim, Ns, +1, ntransf, tol, &plan, popts); - } else // or, NULL here means use default opts... + } else // or, NULL here means use default opts... 
finufft_makeplan(type, dim, Ns, +1, ntransf, tol, &plan, NULL); - + // generate some random nonuniform points - x = (double *)malloc(sizeof(double)*M); - for (j=0; jFmax) Fmax=aF; + if (aF > Fmax) Fmax = aF; } - err = cabs(F[nout] - Ftest)/Fmax; - printf("guru C-interface 1D type-1 NUFFT done. ier=%d, err in F[%d] rel to max(F) is %.3g\n",ier,n,err); + err = cabs(F[nout] - Ftest) / Fmax; + printf("guru C-interface 1D type-1 NUFFT done. ier=%d, err in F[%d] rel to max(F) is " + "%.3g\n", + ier, n, err); - free(x); free(c); free(F); free(popts); - return ier>1; + free(x); + free(c); + free(F); + free(popts); + return ier > 1; } diff --git a/examples/guru1d1f.cpp b/examples/guru1d1f.cpp index a46c4a735..d890d3081 100644 --- a/examples/guru1d1f.cpp +++ b/examples/guru1d1f.cpp @@ -1,85 +1,88 @@ // this is all you must include for the finufft lib... -#include #include +#include // specific to this example... #include -#include #include #include +#include // only good for small projects... using namespace std; // allows 1i to be the imaginary unit... (C++14 onwards) using namespace std::complex_literals; -int main(int argc, char* argv[]) +int main(int argc, char *argv[]) /* Example calling guru C++ interface to FINUFFT library, single-prec, passing pointers to STL vectors of C++ float complex numbers, with a math check. Barnett 7/5/20 Compile on linux with: - g++-7 -std=c++14 -fopenmp guru1d1f.cpp -I../include ../lib-static/libfinufft.a -o guru1d1f -lfftw3f -lfftw3f_omp -lm + g++-7 -std=c++14 -fopenmp guru1d1f.cpp -I../include ../lib-static/libfinufft.a -o + guru1d1f -lfftw3f -lfftw3f_omp -lm Or if you have built a single-core library, remove -fopenmp and -lfftw3f_omp Usage: ./guru1d1f */ { - int M = 1e5; // number of nonuniform points - int N = 1e5; // number of modes - float tol = 1e-5; // desired accuracy + int M = 1e5; // number of nonuniform points + int N = 1e5; // number of modes + float tol = 1e-5; // desired accuracy - int type = 1, dim = 1; // 1d1 - int64_t Ns[3]; // guru describes mode array by vector [N1,N2..] - Ns[0] = N; - int ntransf = 1; // we want to do a single transform at a time - finufftf_plan plan; // creates single-prec plan struct: note the "f" - int changeopts = 1; // do you want to try changing opts? 0 or 1 - if (changeopts) { // demo how to change options away from defaults.. + int type = 1, dim = 1; // 1d1 + int64_t Ns[3]; // guru describes mode array by vector [N1,N2..] + Ns[0] = N; + int ntransf = 1; // we want to do a single transform at a time + finufftf_plan plan; // creates single-prec plan struct: note the "f" + int changeopts = 1; // do you want to try changing opts? 0 or 1 + if (changeopts) { // demo how to change options away from defaults.. finufft_opts opts; - finufftf_default_opts(&opts); // note "f" for single-prec, throughout... - opts.debug = 2; // example options change + finufftf_default_opts(&opts); // note "f" for single-prec, throughout... + opts.debug = 2; // example options change finufftf_makeplan(type, dim, Ns, +1, ntransf, tol, &plan, &opts); - } else // or, NULL here means use default opts... + } else // or, NULL here means use default opts... finufftf_makeplan(type, dim, Ns, +1, ntransf, tol, &plan, NULL); // generate some random nonuniform points vector x(M); - for (int j=0; j> c(M); - for (int j=0; j> F(N); int ier = finufftf_execute(plan, &c[0], &F[0]); // for fun, do another with same NU pts (no re-sorting), but new strengths... 
- for (int j=0; j Ftest = complex(0,0); - for (int j=0; j Ftest = complex(0, 0); + for (int j = 0; j < M; ++j) Ftest += c[j] * exp(1if * (float)n * x[j]); + int nout = n + N / 2; // index in output array for freq mode n + float Fmax = 0.0; // compute inf norm of F + for (int m = 0; m < N; ++m) { float aF = abs(F[m]); - if (aF>Fmax) Fmax=aF; + if (aF > Fmax) Fmax = aF; } - float err = abs(F[nout] - Ftest)/Fmax; - printf("guru 1D type-1 single-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n",ier,n,err); + float err = abs(F[nout] - Ftest) / Fmax; + printf("guru 1D type-1 single-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n", ier, + n, err); return ier; } diff --git a/examples/guru2d1.cpp b/examples/guru2d1.cpp index 06d25e064..cfc39109e 100644 --- a/examples/guru2d1.cpp +++ b/examples/guru2d1.cpp @@ -1,51 +1,53 @@ #include #include -#include #include +#include #include using namespace std; -int main(int argc, char *argv[]){ +int main(int argc, char *argv[]) { + + /* 2D type 1 guru interface example of calling the FINUFFT library from C++, + using STL double complex vectors, with a math test. Similar to simple2d1 + except illustrates the guru interface. -/* 2D type 1 guru interface example of calling the FINUFFT library from C++, - using STL double complex vectors, with a math test. Similar to simple2d1 - except illustrates the guru interface. + Compile multithreaded with + g++ -fopenmp guru2d1.cpp -I ../src ../lib-static/libfinufft.a -o guru2d1 -lfftw3 + -lfftw3_omp -lm single core with: g++ guru2d1.cpp -I ../src + ../lib-static/libfinufft.a -o guru2d1 -lfftw3 -lm - Compile multithreaded with - g++ -fopenmp guru2d1.cpp -I ../src ../lib-static/libfinufft.a -o guru2d1 -lfftw3 -lfftw3_omp -lm - single core with: - g++ guru2d1.cpp -I ../src ../lib-static/libfinufft.a -o guru2d1 -lfftw3 -lm - - Usage: ./guru2d1 -*/ - int M = 1e6; // number of nonuniform points - int N = 1e6; // approximate total number of modes (N1*N2) - double tol = 1e-6; // desired accuracy - finufft_opts opts; finufft_default_opts(&opts); + Usage: ./guru2d1 + */ + int M = 1e6; // number of nonuniform points + int N = 1e6; // approximate total number of modes (N1*N2) + double tol = 1e-6; // desired accuracy + finufft_opts opts; + finufft_default_opts(&opts); opts.upsampfac = 1.25; complex I(0.0, 1.0); // the imaginary unit // generate random non-uniform points on (x,y) and complex strengths (c): vector x(M), y(M); - vector > c(M); + vector> c(M); - for(int i = 0; i < M; i++){ - x[i] = M_PI*(2*(double)rand()/RAND_MAX-1); //uniform random in [-pi, pi) - y[i] = M_PI*(2*(double)rand()/RAND_MAX-1); //uniform random in [-pi, pi) + for (int i = 0; i < M; i++) { + x[i] = M_PI * (2 * (double)rand() / RAND_MAX - 1); // uniform random in [-pi, pi) + y[i] = M_PI * (2 * (double)rand() / RAND_MAX - 1); // uniform random in [-pi, pi) // each component uniform random in [-1,1] - c[i] = 2*((double)rand()/RAND_MAX-1) + I*(2*((double)rand()/RAND_MAX)-1); + c[i] = + 2 * ((double)rand() / RAND_MAX - 1) + I * (2 * ((double)rand() / RAND_MAX) - 1); } // choose numbers of output Fourier coefficients in each dimension - int N1 = round(2.0*sqrt(N)); - int N2 = round(N/N1); - + int N1 = round(2.0 * sqrt(N)); + int N2 = round(N / N1); + // output array for the Fourier modes - vector > F(N1*N2); + vector> F(N1 * N2); - int type=1, dim=2, ntrans=1; // you could also do ntrans>1 - int64_t Ns[] = {N1,N2}; // N1,N2 as 64-bit int array + int type = 1, dim = 2, ntrans = 1; // you could also do ntrans>1 + int64_t Ns[] = {N1, N2}; // N1,N2 as 64-bit int 
array // step 1: make a plan... finufft_plan plan; int ier = finufft_makeplan(type, dim, Ns, +1, ntrans, tol, &plan, NULL); @@ -58,27 +60,28 @@ int main(int argc, char *argv[]){ // step 4: free the memory used by the plan... finufft_destroy(plan); - int k1 = round(0.45*N1); // check the answer for mode frequency (k1,k2) - int k2 = round(-0.35*N2); - - complex Ftest(0,0); - for(int j = 0; j < M; j++) - Ftest += c[j]*exp(I*((double)k1*x[j]+(double)k2*y[j])); + int k1 = round(0.45 * N1); // check the answer for mode frequency (k1,k2) + int k2 = round(-0.35 * N2); + + complex Ftest(0, 0); + for (int j = 0; j < M; j++) + Ftest += c[j] * exp(I * ((double)k1 * x[j] + (double)k2 * y[j])); - // compute inf norm of F + // compute inf norm of F double Fmax = 0.0; - for (int m=0; mFmax) Fmax=aF; + if (aF > Fmax) Fmax = aF; } - + // indices in output array for this frequency pair (k1,k2) - int k1out = k1 + (int)N1/2; - int k2out = k2 + (int)N2/2; - int indexOut = k1out + k2out*(N1); + int k1out = k1 + (int)N1 / 2; + int k2out = k2 + (int)N2 / 2; + int indexOut = k1out + k2out * (N1); // compute relative error - double err = abs(F[indexOut] - Ftest)/Fmax; - cout << "2D type-1 NUFFT done. ier=" << ier << ", err in F[" << indexOut << "] rel to max(F) is " << setprecision(2) << err << endl; + double err = abs(F[indexOut] - Ftest) / Fmax; + cout << "2D type-1 NUFFT done. ier=" << ier << ", err in F[" << indexOut + << "] rel to max(F) is " << setprecision(2) << err << endl; return ier; } diff --git a/examples/gurumany1d1.cpp b/examples/gurumany1d1.cpp index 8f150e609..01503af14 100644 --- a/examples/gurumany1d1.cpp +++ b/examples/gurumany1d1.cpp @@ -1,4 +1,4 @@ -/* Demonstrate guru FINUFFT interface performing a stack of 1d type 1 +/* Demonstrate guru FINUFFT interface performing a stack of 1d type 1 transforms in a single execute call. See guru1d1.cpp for other guru features demonstrated. @@ -11,70 +11,73 @@ */ // this is all you must include for the finufft lib... -#include #include +#include // specific to this demo... +#include #include -#include #include #include -#include +#include // only good for small projects... using namespace std; // allows 1i to be the imaginary unit... (C++14 onwards) using namespace std::complex_literals; -int main(int argc, char* argv[]) -{ - int M = 2e5; // number of nonuniform points - int N = 1e5; // number of modes - double tol = 1e-9; // desired accuracy - int ntrans = 100; // request a bunch of transforms in the execute - int isign = +1; // sign of i in the transform math definition - +int main(int argc, char *argv[]) { + int M = 2e5; // number of nonuniform points + int N = 1e5; // number of modes + double tol = 1e-9; // desired accuracy + int ntrans = 100; // request a bunch of transforms in the execute + int isign = +1; // sign of i in the transform math definition + int type = 1, dim = 1; // 1d1 - int64_t Ns[3] = {N,0,0}; // guru describes mode array by vector [N1,N2..] + int64_t Ns[3] = {N, 0, 0}; // guru describes mode array by vector [N1,N2..] 
finufft_plan plan; // creates a plan struct (NULL below: default opts) finufft_makeplan(type, dim, Ns, isign, ntrans, tol, &plan, NULL); // generate random nonuniform points and pass to FINUFFT vector x(M); - for (int j=0; j> c(M*ntrans); // plain contiguous storage - for (int j=0; j> c(M * ntrans); // plain contiguous storage + for (int j = 0; j < M * ntrans; ++j) + c[j] = + 2 * ((double)rand() / RAND_MAX) - 1 + 1i * (2 * ((double)rand() / RAND_MAX) - 1); // alloc output array for the Fourier modes, then do the transform - vector> F(N*ntrans); - printf("guru many 1D type-1 double-prec, tol=%.3g, executing %d transforms (vectorized), each size %d NU pts to %d modes...\n",tol,ntrans,M,N); + vector> F(N * ntrans); + printf("guru many 1D type-1 double-prec, tol=%.3g, executing %d transforms " + "(vectorized), each size %d NU pts to %d modes...\n", + tol, ntrans, M, N); int ier = finufft_execute(plan, c.data(), F.data()); // could now change c, do another execute, do another setpts, execute, etc... - - finufft_destroy(plan); // don't forget! we're done with transforms of this size - + + finufft_destroy(plan); // don't forget! we're done with transforms of this size + // rest is math checking and reporting... - int k = 42519; // check the answer just for this mode - int trans = 71; // ...testing in just this transform - assert(k>=-(double)N/2 && k<(double)N/2); // ensure meaningful test - assert(trans>=0 && trans Ftest = complex(0,0); - for (int j=0; jFmax) Fmax=aF; + int k = 42519; // check the answer just for this mode + int trans = 71; // ...testing in just this transform + assert(k >= -(double)N / 2 && k < (double)N / 2); // ensure meaningful test + assert(trans >= 0 && trans < ntrans); + complex Ftest = complex(0, 0); + for (int j = 0; j < M; ++j) + Ftest += c[j + M * trans] * exp(1i * (double)k * x[j]); // c offset to trans + double Fmax = 0.0; // compute inf norm of F for selected transform + for (int m = 0; m < N; ++m) { + double aF = abs(F[m + N * trans]); + if (aF > Fmax) Fmax = aF; } - int nout = k+N/2 + N*trans; // output index for freq mode k in the trans - double err = abs(F[nout] - Ftest)/Fmax; - printf("\tdone: ier=%d; for transform %d, rel err in F[%d] is %.3g\n",ier,trans,k,err); + int nout = k + N / 2 + N * trans; // output index for freq mode k in the trans + double err = abs(F[nout] - Ftest) / Fmax; + printf("\tdone: ier=%d; for transform %d, rel err in F[%d] is %.3g\n", ier, trans, k, + err); return ier; } diff --git a/examples/many1d1.cpp b/examples/many1d1.cpp index 8176007c9..4b884d028 100644 --- a/examples/many1d1.cpp +++ b/examples/many1d1.cpp @@ -1,59 +1,61 @@ #include -#include +#include #include #include #include -#include +#include using namespace std; -int main(int argc, char* argv[]) +int main(int argc, char *argv[]) /* Example of calling the vectorized FINUFFT library from C++, using STL double complex vectors, with a math test. 
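Both the guru-vectorized call above and the *many simple interfaces below use one contiguous block per quantity: the strengths for transform t occupy c[t*M ... t*M + M - 1] and its output modes occupy F[t*N ... t*N + N - 1], which is why the checks index c[j + M*trans] and F[m + N*trans]. A tiny layout sketch with toy sizes (not the examples' values):

  #include <complex>
  #include <cstdio>
  #include <vector>

  // Layout sketch (illustrative only): stacked transforms store transform t's
  // strengths at c[t*M + j]; outputs follow the same pattern, F[t*N + m].
  int main() {
    const int ntrans = 3, M = 4; // toy sizes for illustration
    std::vector<std::complex<double>> c(ntrans * M);
    for (int t = 0; t < ntrans; ++t)
      for (int j = 0; j < M; ++j)
        c[t * M + j] = std::complex<double>(t, j); // tag each slot with (transform, point)
    const int t = 2, j = 1;
    std::printf("slot %d holds transform %d, point %d\n", t * M + j, t, j);
    return 0;
  }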
Compile with: - g++ -fopenmp many1d1.cpp -I../include ../lib-static/libfinufft.a -o many1d1 -lfftw3 -lfftw3_omp -lm - or if you have built a single-core version: - g++ many1d1.cpp -I../include ../lib-static/libfinufft.a -o many1d1 -lfftw3 -lm + g++ -fopenmp many1d1.cpp -I../include ../lib-static/libfinufft.a -o many1d1 -lfftw3 + -lfftw3_omp -lm or if you have built a single-core version: g++ many1d1.cpp + -I../include ../lib-static/libfinufft.a -o many1d1 -lfftw3 -lm Usage: ./many1d1 */ { - int ntrans = 3; // how many stacked transforms to do - int M = 1e6; // nonuniform points (same for all transforms) - int N = 1e6; // number of modes (same for all transforms) - double tol = 1e-9; // desired accuracy - finufft_opts* opts = new finufft_opts; // opts is pointer to struct + int ntrans = 3; // how many stacked transforms to do + int M = 1e6; // nonuniform points (same for all transforms) + int N = 1e6; // number of modes (same for all transforms) + double tol = 1e-9; // desired accuracy + finufft_opts *opts = new finufft_opts; // opts is pointer to struct finufft_default_opts(opts); - complex I = complex(0.0,1.0); // the imaginary unit - + complex I = complex(0.0, 1.0); // the imaginary unit + // generate some random nonuniform points (x) and complex strengths (c)... vector x(M); - vector > c(M*ntrans); - for (int j=0; j> c(M * ntrans); + for (int j = 0; j < M; ++j) + x[j] = M_PI * (2 * ((double)rand() / RAND_MAX) - 1); // uniform random in [-pi,pi) + for (int j = 0; j < M * ntrans; ++j) // fill all ntrans vectors... + c[j] = + 2 * ((double)rand() / RAND_MAX) - 1 + I * (2 * ((double)rand() / RAND_MAX) - 1); // allocate output array for the Fourier modes... - vector > F(N*ntrans); + vector> F(N * ntrans); // call the NUFFT (with iflag=+1): note pointers (not STL vecs) passed... - int ier = finufft1d1many(ntrans,M,&x[0],&c[0],+1,tol,N,&F[0],NULL); - - int k = 142519; // check the answer just for this mode... - int trans = ntrans-1; // ...in this transform - assert(k>=-(double)N/2 && k<(double)N/2); - - complex Ftest = complex(0,0); // do the naive calc... - for (int j=0; jFmax) Fmax=aF; + int ier = finufft1d1many(ntrans, M, &x[0], &c[0], +1, tol, N, &F[0], NULL); + + int k = 142519; // check the answer just for this mode... + int trans = ntrans - 1; // ...in this transform + assert(k >= -(double)N / 2 && k < (double)N / 2); + + complex Ftest = complex(0, 0); // do the naive calc... + for (int j = 0; j < M; ++j) + Ftest += c[j + M * trans] * exp(I * (double)k * x[j]); // c from transform # trans + double Fmax = 0.0; // compute inf norm of F for transform # trans + for (int m = 0; m < N; ++m) { + double aF = abs(F[m + N * trans]); + if (aF > Fmax) Fmax = aF; } - int kout = k+N/2+N*trans; // output index, freq mode k, transform # trans - double err = abs(F[kout] - Ftest)/Fmax; - printf("1D type-1 double-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n",ier,k,err); + int kout = k + N / 2 + N * trans; // output index, freq mode k, transform # trans + double err = abs(F[kout] - Ftest) / Fmax; + printf("1D type-1 double-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n", ier, k, + err); return ier; } diff --git a/examples/simple1d1.cpp b/examples/simple1d1.cpp index 1e7f16858..4e547eafc 100644 --- a/examples/simple1d1.cpp +++ b/examples/simple1d1.cpp @@ -2,60 +2,61 @@ #include // also used in this example... 
-#include +#include #include #include #include -#include +#include using namespace std; -int main(int argc, char* argv[]) +int main(int argc, char *argv[]) /* Example of calling the FINUFFT library from C++, using STL double complex vectors, with a math test. Double-precision version (see simple1d1f for single-precision) Compile with (static library case): - g++ simple1d1.cpp -I../include ../lib-static/libfinufft.a -o simple1d1 -lfftw3 -lfftw3_omp - or if you have built a single-core version: g++ simple1d1.cpp -I../include ../lib-static/libfinufft.a -o simple1d1 -lfftw3 + -lfftw3_omp or if you have built a single-core version: g++ simple1d1.cpp -I../include + ../lib-static/libfinufft.a -o simple1d1 -lfftw3 Usage: ./simple1d1 Also see ../docs/cex.rst or online documentation. */ { - int M = 1e6; // number of nonuniform points - int N = 1e6; // number of modes - double acc = 1e-9; // desired accuracy - finufft_opts* opts = new finufft_opts; // opts is pointer to struct + int M = 1e6; // number of nonuniform points + int N = 1e6; // number of modes + double acc = 1e-9; // desired accuracy + finufft_opts *opts = new finufft_opts; // opts is pointer to struct finufft_default_opts(opts); - complex I = complex(0.0,1.0); // the imaginary unit - + complex I = complex(0.0, 1.0); // the imaginary unit + // generate some random nonuniform points (x) and complex strengths (c)... vector x(M); - vector > c(M); - for (int j=0; j> c(M); + for (int j = 0; j < M; ++j) { + x[j] = M_PI * (2 * ((double)rand() / RAND_MAX) - 1); // uniform random in [-pi,pi) + c[j] = + 2 * ((double)rand() / RAND_MAX) - 1 + I * (2 * ((double)rand() / RAND_MAX) - 1); } // allocate output array for the Fourier modes... - vector > F(N); - + vector> F(N); + // call the NUFFT (with iflag=+1): note pointers (not STL vecs) passed... - int ier = finufft1d1(M,&x[0],&c[0],+1,acc,N,&F[0],opts); - - int k = 142519; // check the answer just for this mode frequency... - assert(k>=-(double)N/2 && k<(double)N/2); - complex Ftest = complex(0,0); - for (int j=0; j= -(double)N / 2 && k < (double)N / 2); + complex Ftest = complex(0, 0); + for (int j = 0; j < M; ++j) Ftest += c[j] * exp(I * (double)k * x[j]); + double Fmax = 0.0; // compute inf norm of F + for (int m = 0; m < N; ++m) { double aF = abs(F[m]); - if (aF>Fmax) Fmax=aF; + if (aF > Fmax) Fmax = aF; } - int kout = k+N/2; // index in output array for freq mode k - double err = abs(F[kout] - Ftest)/Fmax; - printf("1D type-1 double-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n",ier,k,err); + int kout = k + N / 2; // index in output array for freq mode k + double err = abs(F[kout] - Ftest) / Fmax; + printf("1D type-1 double-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n", ier, k, + err); return ier; } diff --git a/examples/simple1d1c.c b/examples/simple1d1c.c index b3c718659..5f5c3e565 100644 --- a/examples/simple1d1c.c +++ b/examples/simple1d1c.c @@ -2,60 +2,64 @@ #include // also needed for this example... -#include -#include +#include #include +#include #include -#include +#include -int main(int argc, char* argv[]) +int main(int argc, char *argv[]) /* Simple example of calling the FINUFFT library from C, using C complex type, with a math test. Double-precision. C99 style. opts is struct not ptr to it. 
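Every example in this diff finishes with the same accuracy check: compare one output mode against a direct evaluation of the defining sum and normalize by the sup norm of F. Pulled out as a standalone helper (name and signature are illustrative), that check is:

// Accuracy check repeated inline in each example: relative error of output
// mode k of a 1D type-1 transform against the naive sum, scaled by ||F||_inf.
// Assumes k lies in [-N/2, N/2) and F is nonempty.
#include <algorithm>
#include <complex>
#include <vector>
using namespace std;

double rel_err_mode_k(int k, const vector<double> &x, const vector<complex<double>> &c,
                      const vector<complex<double>> &F) {
  const complex<double> I(0.0, 1.0);
  complex<double> Ftest(0.0, 0.0);
  for (size_t j = 0; j < x.size(); ++j) Ftest += c[j] * exp(I * (double)k * x[j]);
  double Fmax = 0.0;                                 // inf norm of F
  for (const auto &Fm : F) Fmax = max(Fmax, abs(Fm));
  int kout = k + (int)F.size() / 2;                  // output slot for signed frequency k
  return abs(F[kout] - Ftest) / Fmax;
}

The simulplans1d1.cpp example later in this diff factors out essentially the same helper as chk1d1.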
Compile with: - gcc -fopenmp example1d1c.c -I../include ../lib-static/libfinufft.a -o example1d1c -lfftw3 -lfftw3_omp -lm -lstdc++ - or if you have built a single-core version: - gcc example1d1c.c -I../include ../lib-static/libfinufft.a -o example1d1c -lfftw3 -lm -lstdc++ + gcc -fopenmp example1d1c.c -I../include ../lib-static/libfinufft.a -o example1d1c + -lfftw3 -lfftw3_omp -lm -lstdc++ or if you have built a single-core version: gcc + example1d1c.c -I../include ../lib-static/libfinufft.a -o example1d1c -lfftw3 -lm + -lstdc++ Usage: ./example1d1c */ { - int M = 1e6; // number of nonuniform points - int N = 1e6; // number of modes - double tol = 1e-9; // desired accuracy + int M = 1e6; // number of nonuniform points + int N = 1e6; // number of modes + double tol = 1e-9; // desired accuracy // generate some random nonuniform points (x) and complex strengths (c): - double* x = (double *)malloc(sizeof(double)*M); - double complex* c = (double complex*)malloc(sizeof(double complex)*M); - for (int j=0; j=-(double)N/2 && k<(double)N/2); - double complex Ftest = 0.0 + 0.0*I; // defined in complex.h (I too) - for (int j=0; j= -(double)N / 2 && k < (double)N / 2); + double complex Ftest = 0.0 + 0.0 * I; // defined in complex.h (I too) + for (int j = 0; j < M; ++j) Ftest += c[j] * cexp(I * (double)k * x[j]); + double Fmax = 0.0; // compute inf norm of F + for (int m = 0; m < N; ++m) { double aF = cabs(F[m]); - if (aF>Fmax) Fmax=aF; + if (aF > Fmax) Fmax = aF; } - int kout = k+N/2; // index in output array for freq mode k - double err = cabs(F[kout] - Ftest)/Fmax; - printf("1D type 1 NUFFT done. ier=%d, err in F[%d] rel to max(F) is %.3g\n",ier,k,err); + int kout = k + N / 2; // index in output array for freq mode k + double err = cabs(F[kout] - Ftest) / Fmax; + printf("1D type 1 NUFFT done. ier=%d, err in F[%d] rel to max(F) is %.3g\n", ier, k, + err); - free(x); free(c); free(F); + free(x); + free(c); + free(F); return ier; } diff --git a/examples/simple1d1cf.c b/examples/simple1d1cf.c index db79c06e1..c66ecc688 100644 --- a/examples/simple1d1cf.c +++ b/examples/simple1d1cf.c @@ -2,60 +2,63 @@ #include // also needed for this example... -#include -#include +#include #include +#include #include -#include +#include -int main(int argc, char* argv[]) +int main(int argc, char *argv[]) /* Simple example of calling the FINUFFT library from C, using C complex type, with a math test. Single-precision version. C99 style. opts is a struct. 
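A note on the ier value these mains pass back: the cufinufft plan code later in this diff remarks that FINUFFT_WARN_EPS_TOO_SMALL=1 "is OK", so a caller might triage the return code as below (sketch only; treating 1 as a benign warning and the transform as still performed is an assumption drawn from that comment):

// Sketch: triage the ier code returned by the example mains.
// 0 = success; 1 = tolerance-too-small warning; >1 = genuine failure.
#include <cstdio>

int triage_ier(int ier) {
  if (ier == 0) return 0;           // success
  if (ier == 1) {                   // FINUFFT_WARN_EPS_TOO_SMALL in the error headers
    fprintf(stderr, "warning: requested tolerance smaller than achievable\n");
    return 0;
  }
  fprintf(stderr, "FINUFFT call failed, ier=%d\n", ier);
  return ier;
}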
Compile with: - gcc -fopenmp example1d1cf.c -I../include ../lib-static/libfinufft.a -o example1d1cf -lfftw3f -lfftw3f_omp -lm -lstdc++ - or if you have built a single-core version: - gcc example1d1cf.c -I../include ../lib-static/libfinufft.a -o example1d1cf -lfftw3f -lm -lstdc++ + gcc -fopenmp example1d1cf.c -I../include ../lib-static/libfinufft.a -o example1d1cf + -lfftw3f -lfftw3f_omp -lm -lstdc++ or if you have built a single-core version: gcc + example1d1cf.c -I../include ../lib-static/libfinufft.a -o example1d1cf -lfftw3f -lm + -lstdc++ Usage: ./example1d1cf */ { - int M = 1e5; // number of nonuniform points - int N = 1e5; // number of modes (NB if too large lose acc in 1d) - float tol = 1e-3; // desired accuracy + int M = 1e5; // number of nonuniform points + int N = 1e5; // number of modes (NB if too large lose acc in 1d) + float tol = 1e-3; // desired accuracy // generate some random nonuniform points (x) and complex strengths (c): - float* x = (float *)malloc(sizeof(float)*M); - float complex* c = (float complex*)malloc(sizeof(float complex)*M); - for (int j=0; j=-(double)N/2 && k<(double)N/2); - float complex Ftest = 0.0f + 0.0f*I; // defined in complex.h (I too) - for (int j=0; j= -(double)N / 2 && k < (double)N / 2); + float complex Ftest = 0.0f + 0.0f * I; // defined in complex.h (I too) + for (int j = 0; j < M; ++j) Ftest += c[j] * cexpf(I * (float)k * x[j]); + float Fmax = 0.0; // compute inf norm of F + for (int m = 0; m < N; ++m) { float aF = cabsf(F[m]); - if (aF>Fmax) Fmax=aF; + if (aF > Fmax) Fmax = aF; } - int kout = k+N/2; // index in output array for freq mode k - float err = cabsf(F[kout] - Ftest)/Fmax; - printf("1D type 1 NUFFT, single-prec. ier=%d, err in F[%d] rel to max(F) is %.3g\n",ier,k,err); + int kout = k + N / 2; // index in output array for freq mode k + float err = cabsf(F[kout] - Ftest) / Fmax; + printf("1D type 1 NUFFT, single-prec. ier=%d, err in F[%d] rel to max(F) is %.3g\n", + ier, k, err); - free(x); free(c); free(F); + free(x); + free(c); + free(F); return ier; } diff --git a/examples/simple1d1f.cpp b/examples/simple1d1f.cpp index fea98b8d6..3882d8ea1 100644 --- a/examples/simple1d1f.cpp +++ b/examples/simple1d1f.cpp @@ -2,58 +2,58 @@ #include // also needed for this example... -#include +#include #include #include #include -#include +#include using namespace std; -int main(int argc, char* argv[]) +int main(int argc, char *argv[]) /* Example of calling the FINUFFT library from C++, using STL single complex vectors, with a math test. (See simple1d1 for double-precision version.) 
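The single-precision variant changes only the names and the scalar types: the entry points gain an "f" suffix (finufftf_default_opts, finufftf1d1) and the data becomes float / complex<float>. A minimal sketch, assuming the same finufft.h header (error handling and the math check omitted; the helper name is illustrative):

// Single-precision simple interface in a nutshell (sketch, not part of this diff).
#include <finufft.h>
#include <complex>
#include <vector>
using namespace std;

int simple1d1f_sketch(int M, int N, float acc) {
  vector<float> x(M, 0.0f);                // nonuniform points in [-pi,pi)
  vector<complex<float>> c(M, 1.0f), F(N); // strengths in, modes out
  finufft_opts opts;
  finufftf_default_opts(&opts);            // note the "f" suffix
  return finufftf1d1(M, x.data(), c.data(), +1, acc, N, F.data(), &opts);
}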
Compile with: - g++ -fopenmp simple1d1f.cpp -I../include ../lib-static/libfinufft.a -o simple1d1f -lfftw3f -lfftw3f_omp -lm - or if you have built a single-core version: - g++ simple1d1f.cpp -I../include ../lib-static/libfinufft.a -o simple1d1f -lfftw3f -lm + g++ -fopenmp simple1d1f.cpp -I../include ../lib-static/libfinufft.a -o simple1d1f + -lfftw3f -lfftw3f_omp -lm or if you have built a single-core version: g++ + simple1d1f.cpp -I../include ../lib-static/libfinufft.a -o simple1d1f -lfftw3f -lm Usage: ./simple1d1f */ { - int M = 1e5; // number of nonuniform points - int N = 1e5; // number of modes (NB if too large lose acc in 1d) - float acc = 1e-3; // desired accuracy - finufft_opts* opts = new finufft_opts; // opts is pointer to struct - finufftf_default_opts(opts); // note finufft "f" suffix - complex I = complex(0.0,1.0); // the imaginary unit - + int M = 1e5; // number of nonuniform points + int N = 1e5; // number of modes (NB if too large lose acc in 1d) + float acc = 1e-3; // desired accuracy + finufft_opts *opts = new finufft_opts; // opts is pointer to struct + finufftf_default_opts(opts); // note finufft "f" suffix + complex I = complex(0.0, 1.0); // the imaginary unit + // generate some random nonuniform points (x) and complex strengths (c)... vector x(M); - vector > c(M); - for (int j=0; j> c(M); + for (int j = 0; j < M; ++j) { + x[j] = M_PI * (2 * ((float)rand() / RAND_MAX) - 1); // uniform random in [-pi,pi) + c[j] = 2 * ((float)rand() / RAND_MAX) - 1 + I * (2 * ((float)rand() / RAND_MAX) - 1); } // allocate output array for the Fourier modes... - vector > F(N); + vector> F(N); // call the NUFFT (with iflag=+1): note pointers (not STL vecs) passed... - int ier = finufftf1d1(M,&x[0],&c[0],+1,acc,N,&F[0],opts); // note "f" + int ier = finufftf1d1(M, &x[0], &c[0], +1, acc, N, &F[0], opts); // note "f" - int k = 14251; // check the answer just for this mode... - assert(k>=-(double)N/2 && k<(double)N/2); - complex Ftest = complex(0,0); - for (int j=0; j= -(double)N / 2 && k < (double)N / 2); + complex Ftest = complex(0, 0); + for (int j = 0; j < M; ++j) Ftest += c[j] * exp(I * (float)k * x[j]); + float Fmax = 0.0; // compute inf norm of F + for (int m = 0; m < N; ++m) { float aF = abs(F[m]); - if (aF>Fmax) Fmax=aF; + if (aF > Fmax) Fmax = aF; } - int kout = k+N/2; // index in output array for freq mode k - float err = abs(F[kout] - Ftest)/Fmax; - printf("1D type-1 single-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n",ier,k,err); + int kout = k + N / 2; // index in output array for freq mode k + float err = abs(F[kout] - Ftest) / Fmax; + printf("1D type-1 single-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n", ier, k, + err); return ier; } diff --git a/examples/simple2d1.cpp b/examples/simple2d1.cpp index cf912445b..91cce0bd1 100644 --- a/examples/simple2d1.cpp +++ b/examples/simple2d1.cpp @@ -1,76 +1,79 @@ // this is all you must include for the finufft lib... -#include #include +#include // also needed for this example... -#include #include +#include #include using namespace std; -int main(int argc, char *argv[]){ +int main(int argc, char *argv[]) { -/* Simple 2D type-1 example of calling the FINUFFT library from C++, using plain - arrays of C++ complex numbers, with a math test. Double precision version. + /* Simple 2D type-1 example of calling the FINUFFT library from C++, using plain + arrays of C++ complex numbers, with a math test. Double precision version. 
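In 2D the output modes are stored with k1 as the fastest (contiguous) index, so mode (k1,k2), with each component in [-Ni/2, Ni/2), lands at (k1 + N1/2) + (k2 + N2/2)*N1. A hypothetical helper making that flattening explicit:

// Flattened index of the 2D type-1 output mode (k1,k2); k1 varies fastest.
inline long mode_index_2d(long k1, long k2, long N1, long N2) {
  long k1out = k1 + N1 / 2; // fast (contiguous) offset
  long k2out = k2 + N2 / 2; // slow offset
  return k1out + k2out * N1;
}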
- Compile multithreaded with - g++ -fopenmp simple2d1.cpp -I ../src ../lib-static/libfinufft.a -o simple2d1 -lfftw3 -lfftw3_omp -lm - single core with: - g++ simple2d1.cpp -I ../src ../lib-static/libfinufft.a -o simple2d1 -lfftw3 -lm - - Usage: ./simple2d1 -*/ + Compile multithreaded with + g++ -fopenmp simple2d1.cpp -I ../src ../lib-static/libfinufft.a -o simple2d1 -lfftw3 + -lfftw3_omp -lm single core with: g++ simple2d1.cpp -I ../src + ../lib-static/libfinufft.a -o simple2d1 -lfftw3 -lm - int M = 1e6; // number of nonuniform points - int N = 1e6; // approximate total number of modes (N1*N2) - double tol = 1e-6; // desired accuracy - finufft_opts opts; finufft_default_opts(&opts); + Usage: ./simple2d1 + */ + + int M = 1e6; // number of nonuniform points + int N = 1e6; // approximate total number of modes (N1*N2) + double tol = 1e-6; // desired accuracy + finufft_opts opts; + finufft_default_opts(&opts); complex I(0.0, 1.0); // the imaginary unit // generate random non-uniform points on (x,y) and complex strengths (c): vector x(M), y(M); - vector > c(M); + vector> c(M); - for(int i = 0; i < M; i++){ - x[i] = M_PI*(2*(double)rand()/RAND_MAX-1); //uniform random in [-pi, pi) - y[i] = M_PI*(2*(double)rand()/RAND_MAX-1); //uniform random in [-pi, pi) + for (int i = 0; i < M; i++) { + x[i] = M_PI * (2 * (double)rand() / RAND_MAX - 1); // uniform random in [-pi, pi) + y[i] = M_PI * (2 * (double)rand() / RAND_MAX - 1); // uniform random in [-pi, pi) // each component uniform random in [-1,1] - c[i] = 2*((double)rand()/RAND_MAX-1) + I*(2*((double)rand()/RAND_MAX)-1); + c[i] = + 2 * ((double)rand() / RAND_MAX - 1) + I * (2 * ((double)rand() / RAND_MAX) - 1); } // choose numbers of output Fourier coefficients in each dimension - int N1 = round(2.0*sqrt(N)); - int N2 = round(N/N1); - + int N1 = round(2.0 * sqrt(N)); + int N2 = round(N / N1); + // output array for the Fourier modes - vector > F(N1*N2); + vector> F(N1 * N2); // call the NUFFT (with iflag += 1): note passing in pointers... opts.upsampfac = 1.25; - int ier = finufft2d1(M,&x[0],&y[0], &c[0], 1, tol, N1, N2, &F[0], &opts); + int ier = finufft2d1(M, &x[0], &y[0], &c[0], 1, tol, N1, N2, &F[0], &opts); - int k1 = round(0.45*N1); // check the answer for mode frequency (k1,k2) - int k2 = round(-0.35*N2); - - complex Ftest(0,0); - for(int j = 0; j < M; j++) - Ftest += c[j]*exp(I*((double)k1*x[j]+(double)k2*y[j])); + int k1 = round(0.45 * N1); // check the answer for mode frequency (k1,k2) + int k2 = round(-0.35 * N2); - // compute inf norm of F + complex Ftest(0, 0); + for (int j = 0; j < M; j++) + Ftest += c[j] * exp(I * ((double)k1 * x[j] + (double)k2 * y[j])); + + // compute inf norm of F double Fmax = 0.0; - for (int m=0; mFmax) Fmax=aF; + if (aF > Fmax) Fmax = aF; } - + // indices in output array for this frequency pair (k1,k2) - int k1out = k1 + N1/2; - int k2out = k2 + N2/2; - int indexOut = k1out + k2out*(N1); + int k1out = k1 + N1 / 2; + int k2out = k2 + N2 / 2; + int indexOut = k1out + k2out * (N1); // compute relative error - double err = abs(F[indexOut] - Ftest)/Fmax; - cout << "2D type-1 NUFFT done. ier=" << ier << ", err in F[" << indexOut << "] rel to max(F) is " << setprecision(2) << err << endl; + double err = abs(F[indexOut] - Ftest) / Fmax; + cout << "2D type-1 NUFFT done. 
ier=" << ier << ", err in F[" << indexOut + << "] rel to max(F) is " << setprecision(2) << err << endl; return ier; } diff --git a/examples/simulplans1d1.cpp b/examples/simulplans1d1.cpp index b814876a2..4fb5f9449 100644 --- a/examples/simulplans1d1.cpp +++ b/examples/simulplans1d1.cpp @@ -2,37 +2,41 @@ #include // also used in this example... -#include +#include #include #include #include -#include +#include using namespace std; -void strengths(vector>& c) { // fill random complex array - for (long unsigned int j=0; j> &c) { // fill random complex array + for (long unsigned int j = 0; j < c.size(); ++j) + c[j] = + 2 * ((double)rand() / RAND_MAX) - 1 + 1i * (2 * ((double)rand() / RAND_MAX) - 1); } -double chk1d1(int n, vector& x, vector>& c, - vector>& F) +double chk1d1(int n, vector &x, vector> &c, + vector> &F) // return error in output array F, for n'th mode only, rel to ||F||_inf { int N = F.size(); - if (n>=N/2 || n<-N/2) { printf("n out of bounds!\n"); return NAN; } - complex Ftest = complex(0,0); - for (long unsigned int j=0; j= N / 2 || n < -N / 2) { + printf("n out of bounds!\n"); + return NAN; + } + complex Ftest = complex(0, 0); + for (long unsigned int j = 0; j < x.size(); ++j) + Ftest += c[j] * exp(1i * (double)n * x[j]); + int nout = n + N / 2; // index in output array for freq mode n double Fmax = 0.0; // compute inf norm of F - for (int m=0; mFmax) Fmax=aF; + if (aF > Fmax) Fmax = aF; } - return abs(F[nout] - Ftest)/Fmax; + return abs(F[nout] - Ftest) / Fmax; } -int main(int argc, char* argv[]) +int main(int argc, char *argv[]) /* Demo two simultaneous FINUFFT plans (A,B) being handled in C++ without interacting (or at least without crashing; note that FFTW initialization is the only global state of FINUFFT library). @@ -40,20 +44,21 @@ int main(int argc, char* argv[]) Edited from guru1d1, Barnett 2/15/22 Compile & run: - g++ -fopenmp simulplans1d1.cpp -I../include ../lib-static/libfinufft.a -o simulplans1d1 -lfftw3 -lfftw3_omp -lm && ./simulplans1d1 + g++ -fopenmp simulplans1d1.cpp -I../include ../lib-static/libfinufft.a -o simulplans1d1 + -lfftw3 -lfftw3_omp -lm && ./simulplans1d1 */ { - double tol = 1e-9; // desired accuracy for both plans + double tol = 1e-9; // desired accuracy for both plans int type = 1, dim = 1; // 1d1 - int64_t Ns[3]; // guru describes mode array by vector [N1,N2..] + int64_t Ns[3]; // guru describes mode array by vector [N1,N2..] int ntransf = 1; // we want to do a single transform at a time - - int MA = 3e6; // number of nonuniform points PLAN A - int NA = 1e6; // number of modes - int MB = 2e6; // number of nonuniform points PLAN B, diff sizes - int NB = 1e5; // number of modes - finufft_plan planA, planB; // creates plan structs + int MA = 3e6; // number of nonuniform points PLAN A + int NA = 1e6; // number of modes + int MB = 2e6; // number of nonuniform points PLAN B, diff sizes + int NB = 1e5; // number of modes + + finufft_plan planA, planB; // creates plan structs Ns[0] = NA; finufft_makeplan(type, dim, Ns, +1, ntransf, tol, &planA, NULL); Ns[0] = NB; @@ -61,22 +66,22 @@ int main(int argc, char* argv[]) // generate some random nonuniform points vector xA(MA), xB(MB); - for (int j=0; j> cA(MA), cB(MB); strengths(cA); strengths(cB); - + // allocate output arrays for the Fourier modes... 
- vector > FA(NA), FB(NB); + vector> FA(NA), FB(NB); int ierA = finufft_execute(planA, &cA[0], &FA[0]); int ierB = finufft_execute(planB, &cB[0], &FB[0]); @@ -87,14 +92,16 @@ int main(int argc, char* argv[]) ierB = finufft_execute(planB, &cB[0], &FB[0]); finufft_destroy(planA); finufft_destroy(planB); - + // math checking and reporting... - int n = 116354; - double errA = chk1d1(n,xA,cA,FA); - printf("planA: 1D type-1 double-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n",ierA,n,errA); - n = 27152; - double errB = chk1d1(n,xB,cB,FB); - printf("planB: 1D type-1 double-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n",ierB,n,errB); + int n = 116354; + double errA = chk1d1(n, xA, cA, FA); + printf("planA: 1D type-1 double-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n", + ierA, n, errA); + n = 27152; + double errB = chk1d1(n, xB, cB, FB); + printf("planB: 1D type-1 double-prec NUFFT done. ier=%d, rel err in F[%d] is %.3g\n", + ierB, n, errB); return ierA + ierB; } diff --git a/examples/threadsafe1d1.cpp b/examples/threadsafe1d1.cpp index f25f25b8b..da267fa6c 100644 --- a/examples/threadsafe1d1.cpp +++ b/examples/threadsafe1d1.cpp @@ -2,15 +2,15 @@ #include // also used in this example... -#include +#include #include #include -#include -#include #include +#include +#include using namespace std; -int main(int argc, char* argv[]) +int main(int argc, char *argv[]) /* Demo single-threaded FINUFFT calls from inside a OMP parallel block. Adapted from simple1d1.cpp: C++, STL double complex vectors, with math test. Barnett 4/19/21, eg for Goran Zauhar, issue #183. Also see: many1d1.cpp. @@ -26,50 +26,51 @@ int main(int argc, char* argv[]) reporting small error. */ { - int M = 1e5; // number of nonuniform points - int N = 1e5; // number of modes - double acc = 1e-9; // desired accuracy - finufft_opts* opts = new finufft_opts; // opts is pointer to struct + int M = 1e5; // number of nonuniform points + int N = 1e5; // number of modes + double acc = 1e-9; // desired accuracy + finufft_opts *opts = new finufft_opts; // opts is pointer to struct finufft_default_opts(opts); - complex I = complex(0.0,1.0); // the imaginary unit - - opts->nthreads=1; // *crucial* so that each call single-thread (otherwise segfaults) + complex I = complex(0.0, 1.0); // the imaginary unit + + opts->nthreads = 1; // *crucial* so that each call single-thread (otherwise segfaults) // Now have each thread do independent 1D type 1 on their own data: #pragma omp parallel { - // generate some random nonuniform points (x) and complex strengths (c)... - // Note that these are local to the thread (if you have the *same* sets of - // NU pts x for each thread, consider instead using one vectorized multithreaded - // transform, which would be faster). - vector x(M); - vector > c(M); - for (int j=0; j > F(N); + // generate some random nonuniform points (x) and complex strengths (c)... + // Note that these are local to the thread (if you have the *same* sets of + // NU pts x for each thread, consider instead using one vectorized multithreaded + // transform, which would be faster). + vector x(M); + vector> c(M); + for (int j = 0; j < M; ++j) { + x[j] = M_PI * (2 * ((double)rand() / RAND_MAX) - 1); // uniform random in [-pi,pi) + c[j] = + 2 * ((double)rand() / RAND_MAX) - 1 + I * (2 * ((double)rand() / RAND_MAX) - 1); + } - // call the NUFFT (with iflag=+1): note pointers (not STL vecs) passed... - int ier = finufft1d1(M,&x[0],&c[0],+1,acc,N,&F[0],opts); + // allocate output array for the Fourier modes... 
local to the thread + vector> F(N); - int k = 42519; // check the answer just for this mode frequency... - assert(k>=-(double)N/2 && k<(double)N/2); - complex Ftest = complex(0,0); - for (int j=0; jFmax) Fmax=aF; - } - int kout = k+N/2; // index in output array for freq mode k - double err = abs(F[kout] - Ftest)/Fmax; - - printf("[thread %2d] 1D t-1 dbl-prec NUFFT done. ier=%d, rel err in F[%d]: %.3g\n",omp_get_thread_num(),ier,k,err); + // call the NUFFT (with iflag=+1): note pointers (not STL vecs) passed... + int ier = finufft1d1(M, &x[0], &c[0], +1, acc, N, &F[0], opts); + + int k = 42519; // check the answer just for this mode frequency... + assert(k >= -(double)N / 2 && k < (double)N / 2); + complex Ftest = complex(0, 0); + for (int j = 0; j < M; ++j) Ftest += c[j] * exp(I * (double)k * x[j]); + double Fmax = 0.0; // compute inf norm of F + for (int m = 0; m < N; ++m) { + double aF = abs(F[m]); + if (aF > Fmax) Fmax = aF; + } + int kout = k + N / 2; // index in output array for freq mode k + double err = abs(F[kout] - Ftest) / Fmax; + + printf("[thread %2d] 1D t-1 dbl-prec NUFFT done. ier=%d, rel err in F[%d]: %.3g\n", + omp_get_thread_num(), ier, k, err); } - + return 0; } diff --git a/examples/threadsafe2d2f.cpp b/examples/threadsafe2d2f.cpp index 9844af54a..e2ad64bb1 100644 --- a/examples/threadsafe2d2f.cpp +++ b/examples/threadsafe2d2f.cpp @@ -8,7 +8,8 @@ To compile (note uses threads rather than omp version of FFTW3): - g++ -fopenmp threadsafe2d2f.cpp -I../include ../lib/libfinufft.so -o threadsafe2d2f -g -Wall + g++ -fopenmp threadsafe2d2f.cpp -I../include ../lib/libfinufft.so -o threadsafe2d2f -g + -Wall ./threadsafe2d2f <-- use all threads OMP_NUM_THREADS=1 ./threadsafe2d2f <-- sequential, 1 thread @@ -23,43 +24,43 @@ #include // also used in this example... 
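Both thread-safety examples follow the same pattern: force each library call to run single-threaded via opts.nthreads = 1, then parallelize over independent transforms with OpenMP. A condensed sketch of that pattern (helper name illustrative; failures are counted with a reduction rather than the shared status flag used in the examples):

// Thread-safe usage pattern: one single-threaded FINUFFT call per OpenMP iteration.
#include <finufft.h>
#include <complex>
#include <vector>
using namespace std;

int parallel_calls_sketch(int n_jobs, int M, int N, double acc) {
  finufft_opts opts;
  finufft_default_opts(&opts);
  opts.nthreads = 1;             // crucial: keep each call single-threaded
  int failures = 0;
#pragma omp parallel for reduction(+ : failures)
  for (int i = 0; i < n_jobs; i++) {
    vector<double> x(M, 0.0);    // per-thread data (independent transforms)
    vector<complex<double>> c(M, 1.0), F(N);
    int ier = finufft1d1(M, x.data(), c.data(), +1, acc, N, F.data(), &opts);
    if (ier != 0) failures++;
  }
  return failures ? 1 : 0;
}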
-#include #include #include #include +#include using namespace std; -int test_finufft(finufft_opts* opts) - // self-contained small test that one single-prec FINUFFT2D2 has no error/crash +int test_finufft(finufft_opts *opts) +// self-contained small test that one single-prec FINUFFT2D2 has no error/crash { - size_t n_rows = 256, n_cols = 256; // 2d image size - size_t n_read = 512, n_spokes = 128; // some k-space point params - size_t M = n_read*n_spokes; // how many k-space pts; MRI-specific - std::vector x(M); // bunch of zero input data + size_t n_rows = 256, n_cols = 256; // 2d image size + size_t n_read = 512, n_spokes = 128; // some k-space point params + size_t M = n_read * n_spokes; // how many k-space pts; MRI-specific + std::vector x(M); // bunch of zero input data std::vector y(M); - std::vector> img(n_rows * n_cols); // coeffs - std::vector> ksp(M); // output array (vals @ k-space pts) + std::vector> img(n_rows * n_cols); // coeffs + std::vector> ksp(M); // output array (vals @ k-space pts) - int ier = finufftf2d2(M, x.data(), y.data(), ksp.data(), - -1, 1e-3, n_rows, n_cols, img.data(), opts); + int ier = finufftf2d2(M, x.data(), y.data(), ksp.data(), -1, 1e-3, n_rows, n_cols, + img.data(), opts); - std::cout << "\ttest_finufft: exit code " << ier << ", thread " << omp_get_thread_num() << std::endl; + std::cout << "\ttest_finufft: exit code " << ier << ", thread " << omp_get_thread_num() + << std::endl; return ier; } -int main(int argc, char* argv[]) -{ +int main(int argc, char *argv[]) { finufft_opts opts; finufftf_default_opts(&opts); - opts.nthreads = 1; // *crucial* so each call single-thread; else segfaults + opts.nthreads = 1; // *crucial* so each call single-thread; else segfaults - int n_slices = 50; // number of transforms. parallelize over slices - int overallstatus=0; + int n_slices = 50; // number of transforms. 
parallelize over slices + int overallstatus = 0; #pragma omp parallel for for (int i = 0; i < n_slices; i++) { int ier = test_finufft(&opts); - if (ier!=0) overallstatus=1; + if (ier != 0) overallstatus = 1; } - + return overallstatus; } diff --git a/fortran/finufftfort.cpp b/fortran/finufftfort.cpp index 9f415d647..799a10041 100644 --- a/fortran/finufftfort.cpp +++ b/fortran/finufftfort.cpp @@ -26,205 +26,182 @@ // local prec-switching macros for fortran names, ie // underscore-suffixed versions of those at end of defs.h #define FINUFFT_DEFAULT_OPTS_ FINUFFTIFY(_default_opts_) -#define FINUFFT_MAKEPLAN_ FINUFFTIFY(_makeplan_) -#define FINUFFT_SETPTS_ FINUFFTIFY(_setpts_) -#define FINUFFT_EXECUTE_ FINUFFTIFY(_execute_) -#define FINUFFT_DESTROY_ FINUFFTIFY(_destroy_) -#define FINUFFT1D1_ FINUFFTIFY(1d1_) -#define FINUFFT1D2_ FINUFFTIFY(1d2_) -#define FINUFFT1D3_ FINUFFTIFY(1d3_) -#define FINUFFT2D1_ FINUFFTIFY(2d1_) -#define FINUFFT2D2_ FINUFFTIFY(2d2_) -#define FINUFFT2D3_ FINUFFTIFY(2d3_) -#define FINUFFT3D1_ FINUFFTIFY(3d1_) -#define FINUFFT3D2_ FINUFFTIFY(3d2_) -#define FINUFFT3D3_ FINUFFTIFY(3d3_) -#define FINUFFT1D1MANY_ FINUFFTIFY(1d1many_) -#define FINUFFT1D2MANY_ FINUFFTIFY(1d2many_) -#define FINUFFT1D3MANY_ FINUFFTIFY(1d3many_) -#define FINUFFT2D1MANY_ FINUFFTIFY(2d1many_) -#define FINUFFT2D2MANY_ FINUFFTIFY(2d2many_) -#define FINUFFT2D3MANY_ FINUFFTIFY(2d3many_) -#define FINUFFT3D1MANY_ FINUFFTIFY(3d1many_) -#define FINUFFT3D2MANY_ FINUFFTIFY(3d2many_) -#define FINUFFT3D3MANY_ FINUFFTIFY(3d3many_) +#define FINUFFT_MAKEPLAN_ FINUFFTIFY(_makeplan_) +#define FINUFFT_SETPTS_ FINUFFTIFY(_setpts_) +#define FINUFFT_EXECUTE_ FINUFFTIFY(_execute_) +#define FINUFFT_DESTROY_ FINUFFTIFY(_destroy_) +#define FINUFFT1D1_ FINUFFTIFY(1d1_) +#define FINUFFT1D2_ FINUFFTIFY(1d2_) +#define FINUFFT1D3_ FINUFFTIFY(1d3_) +#define FINUFFT2D1_ FINUFFTIFY(2d1_) +#define FINUFFT2D2_ FINUFFTIFY(2d2_) +#define FINUFFT2D3_ FINUFFTIFY(2d3_) +#define FINUFFT3D1_ FINUFFTIFY(3d1_) +#define FINUFFT3D2_ FINUFFTIFY(3d2_) +#define FINUFFT3D3_ FINUFFTIFY(3d3_) +#define FINUFFT1D1MANY_ FINUFFTIFY(1d1many_) +#define FINUFFT1D2MANY_ FINUFFTIFY(1d2many_) +#define FINUFFT1D3MANY_ FINUFFTIFY(1d3many_) +#define FINUFFT2D1MANY_ FINUFFTIFY(2d1many_) +#define FINUFFT2D2MANY_ FINUFFTIFY(2d2many_) +#define FINUFFT2D3MANY_ FINUFFTIFY(2d3many_) +#define FINUFFT3D1MANY_ FINUFFTIFY(3d1many_) +#define FINUFFT3D2MANY_ FINUFFTIFY(3d2many_) +#define FINUFFT3D3MANY_ FINUFFTIFY(3d3many_) #ifdef __cplusplus extern "C" { #endif - + // --------------------- guru interface from fortran ------------------------ -void FINUFFT_MAKEPLAN_(int *type, int *n_dims, BIGINT *n_modes, int *iflag, int *n_transf, FLT *tol, FINUFFT_PLAN *plan, finufft_opts *o, int *ier) -{ +void FINUFFT_MAKEPLAN_(int *type, int *n_dims, BIGINT *n_modes, int *iflag, int *n_transf, + FLT *tol, FINUFFT_PLAN *plan, finufft_opts *o, int *ier) { if (!plan) - fprintf(stderr,"%s fortran: plan must be allocated as at least the size of a C pointer (usually 8 bytes)!\n",__func__); + fprintf(stderr, + "%s fortran: plan must be allocated as at least the size of a C pointer " + "(usually 8 bytes)!\n", + __func__); else { // pass o whether it's a NULL or pointer to a fortran-allocated finufft_opts: *ier = FINUFFT_MAKEPLAN(*type, *n_dims, n_modes, *iflag, *n_transf, *tol, plan, o); } } -void FINUFFT_SETPTS_(FINUFFT_PLAN *plan, BIGINT *M, FLT *xj, FLT *yj, FLT *zj, BIGINT *nk, FLT *s, FLT *t, FLT *u, int *ier) -{ +void FINUFFT_SETPTS_(FINUFFT_PLAN *plan, BIGINT *M, FLT *xj, FLT *yj, FLT *zj, 
BIGINT *nk, + FLT *s, FLT *t, FLT *u, int *ier) { if (!*plan) { - fprintf(stderr,"%s fortran: finufft_plan unallocated!",__func__); + fprintf(stderr, "%s fortran: finufft_plan unallocated!", __func__); return; } - int nk_safe = 0; // catches the case where user passes NULL in - if (nk) - nk_safe = *nk; + int nk_safe = 0; // catches the case where user passes NULL in + if (nk) nk_safe = *nk; *ier = FINUFFT_SETPTS(*plan, *M, xj, yj, zj, nk_safe, s, t, u); } -void FINUFFT_EXECUTE_(FINUFFT_PLAN *plan, CPX *weights, CPX *result, int *ier) -{ +void FINUFFT_EXECUTE_(FINUFFT_PLAN *plan, CPX *weights, CPX *result, int *ier) { if (!plan) - fprintf(stderr,"%s fortran: finufft_plan unallocated!",__func__); + fprintf(stderr, "%s fortran: finufft_plan unallocated!", __func__); else *ier = FINUFFT_EXECUTE(*plan, weights, result); } -void FINUFFT_DESTROY_(FINUFFT_PLAN *plan, int *ier) -{ +void FINUFFT_DESTROY_(FINUFFT_PLAN *plan, int *ier) { if (!plan) - fprintf(stderr,"%s fortran: finufft_plan unallocated!",__func__); + fprintf(stderr, "%s fortran: finufft_plan unallocated!", __func__); else *ier = FINUFFT_DESTROY(*plan); } - // ------------ use FINUFFT to set the default options --------------------- // (Note the finufft_opts is created in f90-style derived types, not here) -void FINUFFT_DEFAULT_OPTS_(finufft_opts* o) -{ +void FINUFFT_DEFAULT_OPTS_(finufft_opts *o) { if (!o) - fprintf(stderr,"%s fortran: opts must be allocated!\n",__func__); + fprintf(stderr, "%s fortran: opts must be allocated!\n", __func__); else // o is a ptr to already-allocated fortran finufft_opts derived type... FINUFFT_DEFAULT_OPTS(o); } - // -------------- simple and many-vector interfaces -------------------- // --- 1D --- -void FINUFFT1D1_(BIGINT* nj, FLT* xj, CPX* cj, int* iflag, FLT* eps, - BIGINT* ms, CPX* fk, finufft_opts* o, int* ier) -{ - *ier = FINUFFT1D1(*nj,xj,cj,*iflag,*eps,*ms,fk,o); +void FINUFFT1D1_(BIGINT *nj, FLT *xj, CPX *cj, int *iflag, FLT *eps, BIGINT *ms, CPX *fk, + finufft_opts *o, int *ier) { + *ier = FINUFFT1D1(*nj, xj, cj, *iflag, *eps, *ms, fk, o); } -void FINUFFT1D1MANY_(int* ntransf, - BIGINT* nj, FLT* xj, CPX* cj, int* iflag, FLT* eps, - BIGINT* ms, CPX* fk, finufft_opts* o, int* ier) -{ - *ier = FINUFFT1D1MANY(*ntransf,*nj,xj,cj,*iflag,*eps,*ms,fk,o); +void FINUFFT1D1MANY_(int *ntransf, BIGINT *nj, FLT *xj, CPX *cj, int *iflag, FLT *eps, + BIGINT *ms, CPX *fk, finufft_opts *o, int *ier) { + *ier = FINUFFT1D1MANY(*ntransf, *nj, xj, cj, *iflag, *eps, *ms, fk, o); } -void FINUFFT1D2_(BIGINT* nj, FLT* xj, CPX* cj, int* iflag, FLT* eps, - BIGINT* ms, CPX* fk, finufft_opts* o, int* ier) -{ - *ier = FINUFFT1D2(*nj,xj,cj,*iflag,*eps,*ms,fk,o); +void FINUFFT1D2_(BIGINT *nj, FLT *xj, CPX *cj, int *iflag, FLT *eps, BIGINT *ms, CPX *fk, + finufft_opts *o, int *ier) { + *ier = FINUFFT1D2(*nj, xj, cj, *iflag, *eps, *ms, fk, o); } -void FINUFFT1D2MANY_(int* ntransf, - BIGINT* nj, FLT* xj, CPX* cj, int* iflag, FLT* eps, - BIGINT* ms, CPX* fk, finufft_opts* o, int* ier) -{ - *ier = FINUFFT1D2MANY(*ntransf,*nj,xj,cj,*iflag,*eps,*ms,fk,o); +void FINUFFT1D2MANY_(int *ntransf, BIGINT *nj, FLT *xj, CPX *cj, int *iflag, FLT *eps, + BIGINT *ms, CPX *fk, finufft_opts *o, int *ier) { + *ier = FINUFFT1D2MANY(*ntransf, *nj, xj, cj, *iflag, *eps, *ms, fk, o); } -void FINUFFT1D3_(BIGINT* nj, FLT* x, CPX* c, int* iflag, FLT* eps, - BIGINT* nk, FLT* s, CPX* f, finufft_opts* o, int* ier) -{ - *ier = FINUFFT1D3(*nj,x,c,*iflag,*eps,*nk,s,f,o); +void FINUFFT1D3_(BIGINT *nj, FLT *x, CPX *c, int *iflag, FLT *eps, BIGINT *nk, FLT *s, + 
CPX *f, finufft_opts *o, int *ier) { + *ier = FINUFFT1D3(*nj, x, c, *iflag, *eps, *nk, s, f, o); } -void FINUFFT1D3MANY_(int* ntransf, - BIGINT* nj, FLT* x, CPX* c, int* iflag, FLT* eps, - BIGINT* nk, FLT* s, CPX* f, finufft_opts* o, int* ier) -{ - *ier = FINUFFT1D3MANY(*ntransf,*nj,x,c,*iflag,*eps,*nk,s,f,o); +void FINUFFT1D3MANY_(int *ntransf, BIGINT *nj, FLT *x, CPX *c, int *iflag, FLT *eps, + BIGINT *nk, FLT *s, CPX *f, finufft_opts *o, int *ier) { + *ier = FINUFFT1D3MANY(*ntransf, *nj, x, c, *iflag, *eps, *nk, s, f, o); } // --- 2D --- -void FINUFFT2D1_(BIGINT* nj, FLT* xj, FLT* yj, CPX* cj, int* iflag, FLT* eps, - BIGINT* ms, BIGINT* mt, CPX* fk, finufft_opts* o, int* ier) -{ - *ier = FINUFFT2D1(*nj,xj,yj,cj,*iflag,*eps,*ms,*mt,fk,o); +void FINUFFT2D1_(BIGINT *nj, FLT *xj, FLT *yj, CPX *cj, int *iflag, FLT *eps, BIGINT *ms, + BIGINT *mt, CPX *fk, finufft_opts *o, int *ier) { + *ier = FINUFFT2D1(*nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); } -void FINUFFT2D1MANY_(int* ntransf, - BIGINT* nj, FLT* xj, FLT* yj, CPX* cj, int* iflag, FLT* eps, - BIGINT* ms, BIGINT* mt, CPX* fk, finufft_opts* o, int* ier) -{ - *ier = FINUFFT2D1MANY(*ntransf,*nj,xj,yj,cj,*iflag,*eps,*ms,*mt,fk,o); +void FINUFFT2D1MANY_(int *ntransf, BIGINT *nj, FLT *xj, FLT *yj, CPX *cj, int *iflag, + FLT *eps, BIGINT *ms, BIGINT *mt, CPX *fk, finufft_opts *o, + int *ier) { + *ier = FINUFFT2D1MANY(*ntransf, *nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); } -void FINUFFT2D2_(BIGINT* nj, FLT* xj, FLT* yj, CPX* cj, int* iflag, FLT* eps, - BIGINT* ms, BIGINT* mt, CPX* fk, finufft_opts* o, int* ier) -{ - *ier = FINUFFT2D2(*nj,xj,yj,cj,*iflag,*eps,*ms,*mt,fk,o); +void FINUFFT2D2_(BIGINT *nj, FLT *xj, FLT *yj, CPX *cj, int *iflag, FLT *eps, BIGINT *ms, + BIGINT *mt, CPX *fk, finufft_opts *o, int *ier) { + *ier = FINUFFT2D2(*nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); } -void FINUFFT2D2MANY_(int* ntransf, - BIGINT* nj, FLT* xj, FLT* yj, CPX* cj, int* iflag, FLT* eps, - BIGINT* ms, BIGINT* mt, CPX* fk, finufft_opts* o, int* ier) -{ - *ier = FINUFFT2D2MANY(*ntransf,*nj,xj,yj,cj,*iflag,*eps,*ms,*mt,fk,o); +void FINUFFT2D2MANY_(int *ntransf, BIGINT *nj, FLT *xj, FLT *yj, CPX *cj, int *iflag, + FLT *eps, BIGINT *ms, BIGINT *mt, CPX *fk, finufft_opts *o, + int *ier) { + *ier = FINUFFT2D2MANY(*ntransf, *nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); } -void FINUFFT2D3_(BIGINT* nj, FLT* x, FLT* y, CPX* c, int* iflag, FLT* eps, - BIGINT* nk, FLT* s, FLT* t, CPX* f, finufft_opts* o, int* ier) -{ - *ier = FINUFFT2D3(*nj,x,y,c,*iflag,*eps,*nk,s,t,f,o); +void FINUFFT2D3_(BIGINT *nj, FLT *x, FLT *y, CPX *c, int *iflag, FLT *eps, BIGINT *nk, + FLT *s, FLT *t, CPX *f, finufft_opts *o, int *ier) { + *ier = FINUFFT2D3(*nj, x, y, c, *iflag, *eps, *nk, s, t, f, o); } -void FINUFFT2D3MANY_(int* ntransf, - BIGINT* nj, FLT* x, FLT* y, CPX* c, int* iflag, FLT* eps, - BIGINT* nk, FLT* s, FLT* t, CPX* f, finufft_opts* o, int* ier) -{ - *ier = FINUFFT2D3MANY(*ntransf,*nj,x,y,c,*iflag,*eps,*nk,s,t,f,o); +void FINUFFT2D3MANY_(int *ntransf, BIGINT *nj, FLT *x, FLT *y, CPX *c, int *iflag, + FLT *eps, BIGINT *nk, FLT *s, FLT *t, CPX *f, finufft_opts *o, + int *ier) { + *ier = FINUFFT2D3MANY(*ntransf, *nj, x, y, c, *iflag, *eps, *nk, s, t, f, o); } // --- 3D --- -void FINUFFT3D1_(BIGINT* nj, FLT* xj, FLT* yj, FLT* zj, CPX* cj, int* iflag, FLT* eps, - BIGINT* ms, BIGINT* mt, BIGINT* mu, CPX* fk, finufft_opts* o, int* ier) -{ - *ier = FINUFFT3D1(*nj,xj,yj,zj,cj,*iflag,*eps,*ms,*mt,*mu,fk,o); +void FINUFFT3D1_(BIGINT *nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, 
int *iflag, FLT *eps, + BIGINT *ms, BIGINT *mt, BIGINT *mu, CPX *fk, finufft_opts *o, int *ier) { + *ier = FINUFFT3D1(*nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -void FINUFFT3D1MANY_(int* ntransf, - BIGINT* nj, FLT* xj, FLT* yj, FLT* zj, CPX* cj, int* iflag, FLT* eps, - BIGINT* ms, BIGINT* mt, BIGINT* mu, CPX* fk, finufft_opts* o, int* ier) -{ - *ier = FINUFFT3D1MANY(*ntransf,*nj,xj,yj,zj,cj,*iflag,*eps,*ms,*mt,*mu,fk,o); +void FINUFFT3D1MANY_(int *ntransf, BIGINT *nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, + int *iflag, FLT *eps, BIGINT *ms, BIGINT *mt, BIGINT *mu, CPX *fk, + finufft_opts *o, int *ier) { + *ier = + FINUFFT3D1MANY(*ntransf, *nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -void FINUFFT3D2_(BIGINT* nj, FLT* xj, FLT* yj, FLT* zj, CPX* cj, int* iflag, FLT* eps, - BIGINT* ms, BIGINT* mt, BIGINT* mu, CPX* fk, finufft_opts* o, int* ier) -{ - *ier = FINUFFT3D2(*nj,xj,yj,zj,cj,*iflag,*eps,*ms,*mt,*mu,fk,o); +void FINUFFT3D2_(BIGINT *nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int *iflag, FLT *eps, + BIGINT *ms, BIGINT *mt, BIGINT *mu, CPX *fk, finufft_opts *o, int *ier) { + *ier = FINUFFT3D2(*nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -void FINUFFT3D2MANY_(int* ntransf, - BIGINT* nj, FLT* xj, FLT* yj, FLT* zj, CPX* cj, int* iflag, FLT* eps, - BIGINT* ms, BIGINT* mt, BIGINT* mu, CPX* fk, finufft_opts* o, int* ier) -{ - *ier = FINUFFT3D2MANY(*ntransf,*nj,xj,yj,zj,cj,*iflag,*eps,*ms,*mt,*mu,fk,o); +void FINUFFT3D2MANY_(int *ntransf, BIGINT *nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, + int *iflag, FLT *eps, BIGINT *ms, BIGINT *mt, BIGINT *mu, CPX *fk, + finufft_opts *o, int *ier) { + *ier = + FINUFFT3D2MANY(*ntransf, *nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -void FINUFFT3D3_(BIGINT* nj, FLT* x, FLT* y, FLT* z, CPX* c, int* iflag, FLT* eps, - BIGINT* nk, FLT* s, FLT* t, FLT* u, CPX* f, finufft_opts* o, int* ier) -{ - *ier = FINUFFT3D3(*nj,x,y,z,c,*iflag,*eps,*nk,s,t,u,f,o); +void FINUFFT3D3_(BIGINT *nj, FLT *x, FLT *y, FLT *z, CPX *c, int *iflag, FLT *eps, + BIGINT *nk, FLT *s, FLT *t, FLT *u, CPX *f, finufft_opts *o, int *ier) { + *ier = FINUFFT3D3(*nj, x, y, z, c, *iflag, *eps, *nk, s, t, u, f, o); } -void FINUFFT3D3MANY_(int* ntransf, - BIGINT* nj, FLT* x, FLT* y, FLT* z, CPX* c, int* iflag, FLT* eps, - BIGINT* nk, FLT* s, FLT* t, FLT* u, CPX* f, finufft_opts* o, int* ier) -{ - *ier = FINUFFT3D3MANY(*ntransf,*nj,x,y,z,c,*iflag,*eps,*nk,s,t,u,f,o); +void FINUFFT3D3MANY_(int *ntransf, BIGINT *nj, FLT *x, FLT *y, FLT *z, CPX *c, int *iflag, + FLT *eps, BIGINT *nk, FLT *s, FLT *t, FLT *u, CPX *f, + finufft_opts *o, int *ier) { + *ier = FINUFFT3D3MANY(*ntransf, *nj, x, y, z, c, *iflag, *eps, *nk, s, t, u, f, o); } - #ifdef __cplusplus } #endif diff --git a/include/cufinufft.h b/include/cufinufft.h index 3c498fed0..b323d94c0 100644 --- a/include/cufinufft.h +++ b/include/cufinufft.h @@ -14,15 +14,15 @@ extern "C" { #endif void cufinufft_default_opts(cufinufft_opts *opts); -int cufinufft_makeplan(int type, int dim, const int64_t *n_modes, int iflag, int ntr, double eps, - cufinufft_plan *d_plan_ptr, cufinufft_opts *opts); -int cufinufftf_makeplan(int type, int dim, const int64_t *n_modes, int iflag, int ntr, float eps, - cufinufftf_plan *d_plan_ptr, cufinufft_opts *opts); - -int cufinufft_setpts(cufinufft_plan d_plan, int M, double *d_x, double *d_y, double *d_z, int N, double *d_s, - double *d_t, double *d_u); -int cufinufftf_setpts(cufinufftf_plan d_plan, int M, float *d_x, float *d_y, float *d_z, int N, float *d_s, - float *d_t, float 
*d_u); +int cufinufft_makeplan(int type, int dim, const int64_t *n_modes, int iflag, int ntr, + double eps, cufinufft_plan *d_plan_ptr, cufinufft_opts *opts); +int cufinufftf_makeplan(int type, int dim, const int64_t *n_modes, int iflag, int ntr, + float eps, cufinufftf_plan *d_plan_ptr, cufinufft_opts *opts); + +int cufinufft_setpts(cufinufft_plan d_plan, int M, double *d_x, double *d_y, double *d_z, + int N, double *d_s, double *d_t, double *d_u); +int cufinufftf_setpts(cufinufftf_plan d_plan, int M, float *d_x, float *d_y, float *d_z, + int N, float *d_s, float *d_t, float *d_u); int cufinufft_execute(cufinufft_plan d_plan, cuDoubleComplex *d_c, cuDoubleComplex *d_fk); int cufinufftf_execute(cufinufftf_plan d_plan, cuFloatComplex *d_c, cuFloatComplex *d_fk); diff --git a/include/cufinufft/common.h b/include/cufinufft/common.h index 3ea437448..7bddc188e 100644 --- a/include/cufinufft/common.h +++ b/include/cufinufft/common.h @@ -10,24 +10,27 @@ namespace cufinufft { namespace common { -template -__global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, cuDoubleComplex *a, T *fwkerhalf1, - T *fwkerhalf2, T *fwkerhalf3, int ns); -template -int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f, cuDoubleComplex *d_a, T *d_fwkerhalf1, - T *d_fwkerhalf2, T *d_fwkerhalf3, int ns, cudaStream_t stream); -template +template +__global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, + cuDoubleComplex *a, T *fwkerhalf1, T *fwkerhalf2, + T *fwkerhalf3, int ns); +template +int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f, + cuDoubleComplex *d_a, T *d_fwkerhalf1, T *d_fwkerhalf2, + T *d_fwkerhalf3, int ns, cudaStream_t stream); +template int setup_spreader_for_nufft(finufft_spread_opts &spopts, T eps, cufinufft_opts opts); -void set_nf_type12(CUFINUFFT_BIGINT ms, cufinufft_opts opts, finufft_spread_opts spopts, CUFINUFFT_BIGINT *nf, - CUFINUFFT_BIGINT b); -template +void set_nf_type12(CUFINUFFT_BIGINT ms, cufinufft_opts opts, finufft_spread_opts spopts, + CUFINUFFT_BIGINT *nf, CUFINUFFT_BIGINT b); +template void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, T *fwkerhalf, finufft_spread_opts opts); -template -void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, std::complex *a, finufft_spread_opts opts); -template -void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex *a, T *fwkerhalf, +template +void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, std::complex *a, finufft_spread_opts opts); +template +void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex *a, + T *fwkerhalf, finufft_spread_opts opts); } // namespace common } // namespace cufinufft diff --git a/include/cufinufft/contrib/helper_cuda.h b/include/cufinufft/contrib/helper_cuda.h index 69dad3b86..3f3f931c6 100644 --- a/include/cufinufft/contrib/helper_cuda.h +++ b/include/cufinufft/contrib/helper_cuda.h @@ -37,95 +37,97 @@ #include -static const char *_cudaGetErrorEnum(cudaError_t error) { return cudaGetErrorName(error); } +static const char *_cudaGetErrorEnum(cudaError_t error) { + return cudaGetErrorName(error); +} // This will output the proper CUDA error strings in the event // that a CUDA host call returns an error #define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) -#define RETURN_IF_CUDA_ERROR \ - { \ - cudaError_t err = cudaGetLastError(); \ - if (err != cudaSuccess) { \ - printf("[%s] Error: %s\n", __func__, cudaGetErrorString(err)); \ - return FINUFFT_ERR_CUDA_FAILURE; \ - } \ - } - -#define 
CUDA_FREE_AND_NULL(val, stream) \ - { \ - if (val != nullptr) { \ - check(cudaFreeAsync(val, stream), #val, __FILE__, __LINE__); \ - val = nullptr; \ - } \ - } +#define RETURN_IF_CUDA_ERROR \ + { \ + cudaError_t err = cudaGetLastError(); \ + if (err != cudaSuccess) { \ + printf("[%s] Error: %s\n", __func__, cudaGetErrorString(err)); \ + return FINUFFT_ERR_CUDA_FAILURE; \ + } \ + } + +#define CUDA_FREE_AND_NULL(val, stream) \ + { \ + if (val != nullptr) { \ + check(cudaFreeAsync(val, stream), #val, __FILE__, __LINE__); \ + val = nullptr; \ + } \ + } static const char *cufftGetErrorString(cufftResult error) { - switch (error) { - case CUFFT_SUCCESS: - return "CUFFT_SUCCESS"; + switch (error) { + case CUFFT_SUCCESS: + return "CUFFT_SUCCESS"; - case CUFFT_INVALID_PLAN: - return "CUFFT_INVALID_PLAN"; + case CUFFT_INVALID_PLAN: + return "CUFFT_INVALID_PLAN"; - case CUFFT_ALLOC_FAILED: - return "CUFFT_ALLOC_FAILED"; + case CUFFT_ALLOC_FAILED: + return "CUFFT_ALLOC_FAILED"; - case CUFFT_INVALID_TYPE: - return "CUFFT_INVALID_TYPE"; + case CUFFT_INVALID_TYPE: + return "CUFFT_INVALID_TYPE"; - case CUFFT_INVALID_VALUE: - return "CUFFT_INVALID_VALUE"; + case CUFFT_INVALID_VALUE: + return "CUFFT_INVALID_VALUE"; - case CUFFT_INTERNAL_ERROR: - return "CUFFT_INTERNAL_ERROR"; + case CUFFT_INTERNAL_ERROR: + return "CUFFT_INTERNAL_ERROR"; - case CUFFT_EXEC_FAILED: - return "CUFFT_EXEC_FAILED"; + case CUFFT_EXEC_FAILED: + return "CUFFT_EXEC_FAILED"; - case CUFFT_SETUP_FAILED: - return "CUFFT_SETUP_FAILED"; + case CUFFT_SETUP_FAILED: + return "CUFFT_SETUP_FAILED"; - case CUFFT_INVALID_SIZE: - return "CUFFT_INVALID_SIZE"; + case CUFFT_INVALID_SIZE: + return "CUFFT_INVALID_SIZE"; - case CUFFT_UNALIGNED_DATA: - return "CUFFT_UNALIGNED_DATA"; + case CUFFT_UNALIGNED_DATA: + return "CUFFT_UNALIGNED_DATA"; - case CUFFT_INCOMPLETE_PARAMETER_LIST: - return "CUFFT_INCOMPLETE_PARAMETER_LIST"; + case CUFFT_INCOMPLETE_PARAMETER_LIST: + return "CUFFT_INCOMPLETE_PARAMETER_LIST"; - case CUFFT_INVALID_DEVICE: - return "CUFFT_INVALID_DEVICE"; + case CUFFT_INVALID_DEVICE: + return "CUFFT_INVALID_DEVICE"; - case CUFFT_PARSE_ERROR: - return "CUFFT_PARSE_ERROR"; + case CUFFT_PARSE_ERROR: + return "CUFFT_PARSE_ERROR"; - case CUFFT_NO_WORKSPACE: - return "CUFFT_NO_WORKSPACE"; + case CUFFT_NO_WORKSPACE: + return "CUFFT_NO_WORKSPACE"; - case CUFFT_NOT_IMPLEMENTED: - return "CUFFT_NOT_IMPLEMENTED"; + case CUFFT_NOT_IMPLEMENTED: + return "CUFFT_NOT_IMPLEMENTED"; - case CUFFT_LICENSE_ERROR: - return "CUFFT_LICENSE_ERROR"; + case CUFFT_LICENSE_ERROR: + return "CUFFT_LICENSE_ERROR"; - case CUFFT_NOT_SUPPORTED: - return "CUFFT_NOT_SUPPORTED"; - } + case CUFFT_NOT_SUPPORTED: + return "CUFFT_NOT_SUPPORTED"; + } - return ""; + return ""; } -template +template int check(T result, char const *const func, const char *const file, int const line) { - if (result) { - fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, static_cast(result), - _cudaGetErrorEnum(result), func); - return FINUFFT_ERR_CUDA_FAILURE; - } + if (result) { + fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, + static_cast(result), _cudaGetErrorEnum(result), func); + return FINUFFT_ERR_CUDA_FAILURE; + } - return 0; + return 0; } #endif // COMMON_HELPER_CUDA_H_ diff --git a/include/cufinufft/cudeconvolve.h b/include/cufinufft/cudeconvolve.h index 6395f16c3..5af2f7e94 100644 --- a/include/cufinufft/cudeconvolve.h +++ b/include/cufinufft/cudeconvolve.h @@ -5,29 +5,35 @@ namespace cufinufft { namespace deconvolve { -template -__global__ void 
deconvolve_1d(int ms, int nf1, int fw_width, cuda_complex *fw, cuda_complex *fk, T *fwkerhalf1); -template -__global__ void amplify_1d(int ms, int nf1, int fw_width, cuda_complex *fw, cuda_complex *fk, T *fwkerhalf2); -template -__global__ void deconvolve_2d(int ms, int mt, int nf1, int nf2, int fw_width, cuda_complex *fw, cuda_complex *fk, - T *fwkerhalf1, T *fwkerhalf2); -template -__global__ void amplify_2d(int ms, int mt, int nf1, int nf2, int fw_width, cuda_complex *fw, cuda_complex *fk, - T *fwkerhalf1, T *fwkerhalf2); +template +__global__ void deconvolve_1d(int ms, int nf1, int fw_width, cuda_complex *fw, + cuda_complex *fk, T *fwkerhalf1); +template +__global__ void amplify_1d(int ms, int nf1, int fw_width, cuda_complex *fw, + cuda_complex *fk, T *fwkerhalf2); +template +__global__ void deconvolve_2d(int ms, int mt, int nf1, int nf2, int fw_width, + cuda_complex *fw, cuda_complex *fk, T *fwkerhalf1, + T *fwkerhalf2); +template +__global__ void amplify_2d(int ms, int mt, int nf1, int nf2, int fw_width, + cuda_complex *fw, cuda_complex *fk, T *fwkerhalf1, + T *fwkerhalf2); -template -__global__ void deconvolve_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, int fw_width, cuda_complex *fw, - cuda_complex *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3); -template -__global__ void amplify_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, int fw_width, cuda_complex *fw, - cuda_complex *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3); +template +__global__ void deconvolve_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, + int fw_width, cuda_complex *fw, cuda_complex *fk, + T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3); +template +__global__ void amplify_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, + int fw_width, cuda_complex *fw, cuda_complex *fk, + T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3); -template +template int cudeconvolve1d(cufinufft_plan_t *d_mem, int blksize); -template +template int cudeconvolve2d(cufinufft_plan_t *d_mem, int blksize); -template +template int cudeconvolve3d(cufinufft_plan_t *d_mem, int blksize); } // namespace deconvolve } // namespace cufinufft diff --git a/include/cufinufft/defs.h b/include/cufinufft/defs.h index 6cdb84340..6b2a075ea 100644 --- a/include/cufinufft/defs.h +++ b/include/cufinufft/defs.h @@ -4,11 +4,12 @@ #include // constants needed within common -// upper bound on w, ie nspread, even when padded (see evaluate_kernel_vector); also for common +// upper bound on w, ie nspread, even when padded (see evaluate_kernel_vector); also for +// common #define MAX_NSPREAD 16 // max number of positive quadr nodes -#define MAX_NQUAD 100 +#define MAX_NQUAD 100 // FIXME: If cufft ever takes N > INT_MAX... 
constexpr int32_t MAX_NF = std::numeric_limits::max(); @@ -18,16 +19,16 @@ constexpr int32_t MAX_NF = std::numeric_limits::max(); #ifdef _OPENMP #include // point to actual omp utils -#define MY_OMP_GET_NUM_THREADS() omp_get_num_threads() -#define MY_OMP_GET_MAX_THREADS() omp_get_max_threads() -#define MY_OMP_GET_THREAD_NUM() omp_get_thread_num() +#define MY_OMP_GET_NUM_THREADS() omp_get_num_threads() +#define MY_OMP_GET_MAX_THREADS() omp_get_max_threads() +#define MY_OMP_GET_THREAD_NUM() omp_get_thread_num() #define MY_OMP_SET_NUM_THREADS(x) omp_set_num_threads(x) -#define MY_OMP_SET_NESTED(x) omp_set_nested(x) +#define MY_OMP_SET_NESTED(x) omp_set_nested(x) #else // non-omp safe dummy versions of omp utils #define MY_OMP_GET_NUM_THREADS() 1 #define MY_OMP_GET_MAX_THREADS() 1 -#define MY_OMP_GET_THREAD_NUM() 0 +#define MY_OMP_GET_THREAD_NUM() 0 #define MY_OMP_SET_NUM_THREADS(x) #define MY_OMP_SET_NESTED(x) #endif diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 34b969b46..3b8d3db2c 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -16,255 +16,269 @@ #include // 1d -template -int cufinufft1d1_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); -template -int cufinufft1d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); +template +int cufinufft1d1_exec(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan); +template +int cufinufft1d2_exec(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan); // 2d -template -int cufinufft2d1_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); -template -int cufinufft2d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); +template +int cufinufft2d1_exec(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan); +template +int cufinufft2d2_exec(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan); // 3d -template -int cufinufft3d1_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); -template -int cufinufft3d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); +template +int cufinufft3d1_exec(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan); +template +int cufinufft3d2_exec(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan); static void cufinufft_setup_binsize(int type, int dim, cufinufft_opts *opts) { - switch (dim) { - case 1: { - opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 1024 : opts->gpu_binsizex; - opts->gpu_binsizey = 1; - opts->gpu_binsizez = 1; - } break; + switch (dim) { + case 1: { + opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 1024 : opts->gpu_binsizex; + opts->gpu_binsizey = 1; + opts->gpu_binsizez = 1; + } break; + case 2: { + opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 32 : opts->gpu_binsizex; + opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 32 : opts->gpu_binsizey; + opts->gpu_binsizez = 1; + } break; + case 3: { + switch (opts->gpu_method) { + case 1: case 2: { - opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 32 : opts->gpu_binsizex; - opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 32 : opts->gpu_binsizey; - opts->gpu_binsizez = 1; + opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 16 : opts->gpu_binsizex; + opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 16 : opts->gpu_binsizey; + opts->gpu_binsizez = (opts->gpu_binsizez < 0) ? 2 : opts->gpu_binsizez; } break; - case 3: { - switch (opts->gpu_method) { - case 1: - case 2: { - opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 
16 : opts->gpu_binsizex; - opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 16 : opts->gpu_binsizey; - opts->gpu_binsizez = (opts->gpu_binsizez < 0) ? 2 : opts->gpu_binsizez; - } break; - case 4: { - opts->gpu_obinsizex = (opts->gpu_obinsizex < 0) ? 8 : opts->gpu_obinsizex; - opts->gpu_obinsizey = (opts->gpu_obinsizey < 0) ? 8 : opts->gpu_obinsizey; - opts->gpu_obinsizez = (opts->gpu_obinsizez < 0) ? 8 : opts->gpu_obinsizez; - opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 4 : opts->gpu_binsizex; - opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 4 : opts->gpu_binsizey; - opts->gpu_binsizez = (opts->gpu_binsizez < 0) ? 4 : opts->gpu_binsizez; - } break; - } + case 4: { + opts->gpu_obinsizex = (opts->gpu_obinsizex < 0) ? 8 : opts->gpu_obinsizex; + opts->gpu_obinsizey = (opts->gpu_obinsizey < 0) ? 8 : opts->gpu_obinsizey; + opts->gpu_obinsizez = (opts->gpu_obinsizez < 0) ? 8 : opts->gpu_obinsizez; + opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 4 : opts->gpu_binsizex; + opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 4 : opts->gpu_binsizey; + opts->gpu_binsizez = (opts->gpu_binsizez < 0) ? 4 : opts->gpu_binsizez; } break; } + } break; + } } -template +template int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntransf, T tol, cufinufft_plan_t **d_plan_ptr, cufinufft_opts *opts) { - /* - "plan" stage (in single or double precision). - See ../docs/cppdoc.md for main user-facing documentation. - Note that *d_plan_ptr in the args list was called simply *plan there. - This is the remaining dev-facing doc: - - This performs: - (0) creating a new plan struct (d_plan), a pointer to which is passed - back by writing that pointer into *d_plan_ptr. - (1) set up the spread option, d_plan.spopts. - (2) calculate the correction factor on cpu, copy the value from cpu to - gpu - (3) allocate gpu arrays with size determined by number of fourier modes - and method related options that had been set in d_plan.opts - (4) call cufftPlanMany and save the cufft plan inside cufinufft plan - Variables and arrays inside the plan struct are set and allocated. - - Melody Shih 07/25/19. Use-facing moved to markdown, Barnett 2/16/21. - */ - int ier; - cuDoubleComplex *d_a = nullptr; // fseries temp data - T *d_f = nullptr; // fseries temp data - - if (type < 1 || type > 2) { - fprintf(stderr, "[%s] Invalid type (%d): should be 1 or 2.\n", __func__, type); - return FINUFFT_ERR_TYPE_NOTVALID; - } - if (ntransf < 1) { - fprintf(stderr, "[%s] Invalid ntransf (%d): should be at least 1.\n", __func__, ntransf); - return FINUFFT_ERR_NTRANS_NOTVALID; - } - - // Mult-GPU support: set the CUDA Device ID: - const int device_id = opts == NULL ? 0 : opts->gpu_device_id; - cufinufft::utils::WithCudaDevice device_swapper(device_id); - - /* allocate the plan structure, assign address to user pointer. */ - cufinufft_plan_t *d_plan = new cufinufft_plan_t; - *d_plan_ptr = d_plan; - // Zero out your struct, (sets all pointers to NULL) - memset(d_plan, 0, sizeof(*d_plan)); - - /* If a user has not supplied their own options, assign defaults for them. */ - if (opts == NULL) { // use default opts - cufinufft_default_opts(&(d_plan->opts)); - } else { // or read from what's passed in - d_plan->opts = *opts; // keep a deep copy; changing *opts now has no effect - } - - auto &stream = d_plan->stream = (cudaStream_t)d_plan->opts.gpu_stream; - - /* Automatically set GPU method. */ - if (d_plan->opts.gpu_method == 0) { - /* For type 1, we default to method 2 (SM) since this is generally faster. 
- * However, in the special case of _double precision_ in _three dimensions_ - * with more than _three digits of precision_, there is note enough shared - * memory for this to work. As a result, we will default to method 1 (GM) in - * this special case. - * - * For type 2, we always default to method 1 (GM). */ - if (type == 1 && (sizeof(T) == 4 || dim < 3 || tol >= 1e-3)) - d_plan->opts.gpu_method = 2; - else if (type == 1 && tol < 1e-3) - d_plan->opts.gpu_method = 1; - else if (type == 2) - d_plan->opts.gpu_method = 1; - } - - /* Setup Spreader */ - using namespace cufinufft::common; - // can return FINUFFT_WARN_EPS_TOO_SMALL=1, which is OK - if ((ier = setup_spreader_for_nufft(d_plan->spopts, tol, d_plan->opts)) > 1) { - delete *d_plan_ptr; - *d_plan_ptr = nullptr; - return ier; - } - - d_plan->dim = dim; - d_plan->ms = nmodes[0]; - d_plan->mt = nmodes[1]; - d_plan->mu = nmodes[2]; - - cufinufft_setup_binsize(type, dim, &d_plan->opts); - CUFINUFFT_BIGINT nf1 = 1, nf2 = 1, nf3 = 1; - set_nf_type12(d_plan->ms, d_plan->opts, d_plan->spopts, &nf1, d_plan->opts.gpu_obinsizex); + /* + "plan" stage (in single or double precision). + See ../docs/cppdoc.md for main user-facing documentation. + Note that *d_plan_ptr in the args list was called simply *plan there. + This is the remaining dev-facing doc: + + This performs: + (0) creating a new plan struct (d_plan), a pointer to which is passed + back by writing that pointer into *d_plan_ptr. + (1) set up the spread option, d_plan.spopts. + (2) calculate the correction factor on cpu, copy the value from cpu to + gpu + (3) allocate gpu arrays with size determined by number of fourier modes + and method related options that had been set in d_plan.opts + (4) call cufftPlanMany and save the cufft plan inside cufinufft plan + Variables and arrays inside the plan struct are set and allocated. + + Melody Shih 07/25/19. Use-facing moved to markdown, Barnett 2/16/21. + */ + int ier; + cuDoubleComplex *d_a = nullptr; // fseries temp data + T *d_f = nullptr; // fseries temp data + + if (type < 1 || type > 2) { + fprintf(stderr, "[%s] Invalid type (%d): should be 1 or 2.\n", __func__, type); + return FINUFFT_ERR_TYPE_NOTVALID; + } + if (ntransf < 1) { + fprintf(stderr, "[%s] Invalid ntransf (%d): should be at least 1.\n", __func__, + ntransf); + return FINUFFT_ERR_NTRANS_NOTVALID; + } + + // Mult-GPU support: set the CUDA Device ID: + const int device_id = opts == NULL ? 0 : opts->gpu_device_id; + cufinufft::utils::WithCudaDevice device_swapper(device_id); + + /* allocate the plan structure, assign address to user pointer. */ + cufinufft_plan_t *d_plan = new cufinufft_plan_t; + *d_plan_ptr = d_plan; + // Zero out your struct, (sets all pointers to NULL) + memset(d_plan, 0, sizeof(*d_plan)); + + /* If a user has not supplied their own options, assign defaults for them. */ + if (opts == NULL) { // use default opts + cufinufft_default_opts(&(d_plan->opts)); + } else { // or read from what's passed in + d_plan->opts = *opts; // keep a deep copy; changing *opts now has no effect + } + + auto &stream = d_plan->stream = (cudaStream_t)d_plan->opts.gpu_stream; + + /* Automatically set GPU method. */ + if (d_plan->opts.gpu_method == 0) { + /* For type 1, we default to method 2 (SM) since this is generally faster. + * However, in the special case of _double precision_ in _three dimensions_ + * with more than _three digits of precision_, there is note enough shared + * memory for this to work. As a result, we will default to method 1 (GM) in + * this special case. 
+ * + * For type 2, we always default to method 1 (GM). */ + if (type == 1 && (sizeof(T) == 4 || dim < 3 || tol >= 1e-3)) + d_plan->opts.gpu_method = 2; + else if (type == 1 && tol < 1e-3) + d_plan->opts.gpu_method = 1; + else if (type == 2) + d_plan->opts.gpu_method = 1; + } + + /* Setup Spreader */ + using namespace cufinufft::common; + // can return FINUFFT_WARN_EPS_TOO_SMALL=1, which is OK + if ((ier = setup_spreader_for_nufft(d_plan->spopts, tol, d_plan->opts)) > 1) { + delete *d_plan_ptr; + *d_plan_ptr = nullptr; + return ier; + } + + d_plan->dim = dim; + d_plan->ms = nmodes[0]; + d_plan->mt = nmodes[1]; + d_plan->mu = nmodes[2]; + + cufinufft_setup_binsize(type, dim, &d_plan->opts); + CUFINUFFT_BIGINT nf1 = 1, nf2 = 1, nf3 = 1; + set_nf_type12(d_plan->ms, d_plan->opts, d_plan->spopts, &nf1, + d_plan->opts.gpu_obinsizex); + if (dim > 1) + set_nf_type12(d_plan->mt, d_plan->opts, d_plan->spopts, &nf2, + d_plan->opts.gpu_obinsizey); + if (dim > 2) + set_nf_type12(d_plan->mu, d_plan->opts, d_plan->spopts, &nf3, + d_plan->opts.gpu_obinsizez); + int fftsign = (iflag >= 0) ? 1 : -1; + + d_plan->nf1 = nf1; + d_plan->nf2 = nf2; + d_plan->nf3 = nf3; + d_plan->iflag = fftsign; + d_plan->ntransf = ntransf; + int maxbatchsize = opts ? opts->gpu_maxbatchsize : 0; + if (maxbatchsize == 0) // implies: use a heuristic. + maxbatchsize = std::min(ntransf, 8); // heuristic from test codes + d_plan->maxbatchsize = maxbatchsize; + d_plan->type = type; + + if (d_plan->type == 1) d_plan->spopts.spread_direction = 1; + if (d_plan->type == 2) d_plan->spopts.spread_direction = 2; + + using namespace cufinufft::memtransfer; + switch (d_plan->dim) { + case 1: { + if ((ier = allocgpumem1d_plan(d_plan))) goto finalize; + } break; + case 2: { + if ((ier = allocgpumem2d_plan(d_plan))) goto finalize; + } break; + case 3: { + if ((ier = allocgpumem3d_plan(d_plan))) goto finalize; + } break; + } + + cufftHandle fftplan; + cufftResult_t cufft_status; + switch (d_plan->dim) { + case 1: { + int n[] = {(int)nf1}; + int inembed[] = {(int)nf1}; + + cufft_status = cufftPlanMany(&fftplan, 1, n, inembed, 1, inembed[0], inembed, 1, + inembed[0], cufft_type(), maxbatchsize); + } break; + case 2: { + int n[] = {(int)nf2, (int)nf1}; + int inembed[] = {(int)nf2, (int)nf1}; + + cufft_status = + cufftPlanMany(&fftplan, 2, n, inembed, 1, inembed[0] * inembed[1], inembed, 1, + inembed[0] * inembed[1], cufft_type(), maxbatchsize); + } break; + case 3: { + int n[] = {(int)nf3, (int)nf2, (int)nf1}; + int inembed[] = {(int)nf3, (int)nf2, (int)nf1}; + + cufft_status = cufftPlanMany( + &fftplan, 3, n, inembed, 1, inembed[0] * inembed[1] * inembed[2], inembed, 1, + inembed[0] * inembed[1] * inembed[2], cufft_type(), maxbatchsize); + } break; + } + + if (cufft_status != CUFFT_SUCCESS) { + fprintf(stderr, "[%s] cufft makeplan error: %s", __func__, + cufftGetErrorString(cufft_status)); + ier = FINUFFT_ERR_CUDA_FAILURE; + goto finalize; + } + cufftSetStream(fftplan, stream); + + d_plan->fftplan = fftplan; + { + std::complex *a = d_plan->fseries_precomp_a; + T *f = d_plan->fseries_precomp_f; + + onedim_fseries_kernel_precomp(nf1, f, a, d_plan->spopts); if (dim > 1) - set_nf_type12(d_plan->mt, d_plan->opts, d_plan->spopts, &nf2, d_plan->opts.gpu_obinsizey); + onedim_fseries_kernel_precomp(nf2, f + MAX_NQUAD, a + MAX_NQUAD, d_plan->spopts); if (dim > 2) - set_nf_type12(d_plan->mu, d_plan->opts, d_plan->spopts, &nf3, d_plan->opts.gpu_obinsizez); - int fftsign = (iflag >= 0) ? 
1 : -1; - - d_plan->nf1 = nf1; - d_plan->nf2 = nf2; - d_plan->nf3 = nf3; - d_plan->iflag = fftsign; - d_plan->ntransf = ntransf; - int maxbatchsize = opts ? opts->gpu_maxbatchsize : 0; - if (maxbatchsize == 0) // implies: use a heuristic. - maxbatchsize = std::min(ntransf, 8); // heuristic from test codes - d_plan->maxbatchsize = maxbatchsize; - d_plan->type = type; - - if (d_plan->type == 1) - d_plan->spopts.spread_direction = 1; - if (d_plan->type == 2) - d_plan->spopts.spread_direction = 2; - - using namespace cufinufft::memtransfer; - switch (d_plan->dim) { - case 1: { - if ((ier = allocgpumem1d_plan(d_plan))) - goto finalize; - } break; - case 2: { - if ((ier = allocgpumem2d_plan(d_plan))) - goto finalize; - } break; - case 3: { - if ((ier = allocgpumem3d_plan(d_plan))) - goto finalize; - } break; - } - - cufftHandle fftplan; - cufftResult_t cufft_status; - switch (d_plan->dim) { - case 1: { - int n[] = {(int)nf1}; - int inembed[] = {(int)nf1}; - - cufft_status = cufftPlanMany(&fftplan, 1, n, inembed, 1, inembed[0], inembed, 1, inembed[0], cufft_type(), - maxbatchsize); - } break; - case 2: { - int n[] = {(int)nf2, (int)nf1}; - int inembed[] = {(int)nf2, (int)nf1}; - - cufft_status = cufftPlanMany(&fftplan, 2, n, inembed, 1, inembed[0] * inembed[1], inembed, 1, - inembed[0] * inembed[1], cufft_type(), maxbatchsize); - } break; - case 3: { - int n[] = {(int)nf3, (int)nf2, (int)nf1}; - int inembed[] = {(int)nf3, (int)nf2, (int)nf1}; - - cufft_status = cufftPlanMany(&fftplan, 3, n, inembed, 1, inembed[0] * inembed[1] * inembed[2], inembed, 1, - inembed[0] * inembed[1] * inembed[2], cufft_type(), maxbatchsize); - } break; - } - - if (cufft_status != CUFFT_SUCCESS) { - fprintf(stderr, "[%s] cufft makeplan error: %s", __func__, cufftGetErrorString(cufft_status)); - ier = FINUFFT_ERR_CUDA_FAILURE; - goto finalize; - } - cufftSetStream(fftplan, stream); - - d_plan->fftplan = fftplan; - { - std::complex *a = d_plan->fseries_precomp_a; - T *f = d_plan->fseries_precomp_f; - - onedim_fseries_kernel_precomp(nf1, f, a, d_plan->spopts); - if (dim > 1) - onedim_fseries_kernel_precomp(nf2, f + MAX_NQUAD, a + MAX_NQUAD, d_plan->spopts); - if (dim > 2) - onedim_fseries_kernel_precomp(nf3, f + 2 * MAX_NQUAD, a + 2 * MAX_NQUAD, d_plan->spopts); - - if ((ier = checkCudaErrors(cudaMallocAsync(&d_a, dim * MAX_NQUAD * sizeof(cuDoubleComplex), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_f, dim * MAX_NQUAD * sizeof(T), stream)))) - goto finalize; - if ((ier = checkCudaErrors( - cudaMemcpyAsync(d_a, a, dim * MAX_NQUAD * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice, stream)))) - goto finalize; - if ((ier = - checkCudaErrors(cudaMemcpyAsync(d_f, f, dim * MAX_NQUAD * sizeof(T), cudaMemcpyHostToDevice, stream)))) - goto finalize; - if ((ier = cufserieskernelcompute(d_plan->dim, nf1, nf2, nf3, d_f, d_a, d_plan->fwkerhalf1, d_plan->fwkerhalf2, - d_plan->fwkerhalf3, d_plan->spopts.nspread, stream))) - goto finalize; - } + onedim_fseries_kernel_precomp(nf3, f + 2 * MAX_NQUAD, a + 2 * MAX_NQUAD, + d_plan->spopts); + + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_a, dim * MAX_NQUAD * sizeof(cuDoubleComplex), stream)))) + goto finalize; + if ((ier = + checkCudaErrors(cudaMallocAsync(&d_f, dim * MAX_NQUAD * sizeof(T), stream)))) + goto finalize; + if ((ier = checkCudaErrors( + cudaMemcpyAsync(d_a, a, dim * MAX_NQUAD * sizeof(cuDoubleComplex), + cudaMemcpyHostToDevice, stream)))) + goto finalize; + if ((ier = checkCudaErrors(cudaMemcpyAsync(d_f, f, dim * MAX_NQUAD * sizeof(T), + 
cudaMemcpyHostToDevice, stream)))) + goto finalize; + if ((ier = cufserieskernelcompute( + d_plan->dim, nf1, nf2, nf3, d_f, d_a, d_plan->fwkerhalf1, d_plan->fwkerhalf2, + d_plan->fwkerhalf3, d_plan->spopts.nspread, stream))) + goto finalize; + } finalize: - cudaFreeAsync(d_a, stream); - cudaFreeAsync(d_f, stream); + cudaFreeAsync(d_a, stream); + cudaFreeAsync(d_f, stream); - if (ier > 1) { - delete *d_plan_ptr; - *d_plan_ptr = nullptr; - } + if (ier > 1) { + delete *d_plan_ptr; + *d_plan_ptr = nullptr; + } - return ier; + return ier; } -template -int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_t, T *d_u, cufinufft_plan_t *d_plan) +template +int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_t, T *d_u, + cufinufft_plan_t *d_plan) /* "setNUpts" stage (in single or double precision). @@ -302,66 +316,78 @@ Notes: the type T means either single or double, matching the Melody Shih 07/25/19; Barnett 2/16/21 moved out docs. */ { - cufinufft::utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); - - int nf1 = d_plan->nf1; - int nf2 = d_plan->nf2; - int nf3 = d_plan->nf3; - int dim = d_plan->dim; - - d_plan->M = M; - - using namespace cufinufft::memtransfer; - int ier; - switch (d_plan->dim) { - case 1: { - ier = allocgpumem1d_nupts(d_plan); - } break; - case 2: { - ier = allocgpumem2d_nupts(d_plan); - } break; - case 3: { - ier = allocgpumem3d_nupts(d_plan); - } break; - } - if (ier) - return ier; - - d_plan->kx = d_kx; - if (dim > 1) - d_plan->ky = d_ky; - if (dim > 2) - d_plan->kz = d_kz; - - using namespace cufinufft::spreadinterp; - switch (d_plan->dim) { - case 1: { - if (d_plan->opts.gpu_method == 1 && (ier = cuspread1d_nuptsdriven_prop(nf1, M, d_plan))) - fprintf(stderr, "error: cuspread1d_nupts_prop, method(%d)\n", d_plan->opts.gpu_method); - if (d_plan->opts.gpu_method == 2 && (ier = cuspread1d_subprob_prop(nf1, M, d_plan))) - fprintf(stderr, "error: cuspread1d_subprob_prop, method(%d)\n", d_plan->opts.gpu_method); - } break; - case 2: { - if (d_plan->opts.gpu_method == 1 && (ier = cuspread2d_nuptsdriven_prop(nf1, nf2, M, d_plan))) - fprintf(stderr, "error: cuspread2d_nupts_prop, method(%d)\n", d_plan->opts.gpu_method); - if (d_plan->opts.gpu_method == 2 && (ier = cuspread2d_subprob_prop(nf1, nf2, M, d_plan))) - fprintf(stderr, "error: cuspread2d_subprob_prop, method(%d)\n", d_plan->opts.gpu_method); - } break; - case 3: { - if (d_plan->opts.gpu_method == 1 && (ier = cuspread3d_nuptsdriven_prop(nf1, nf2, nf3, M, d_plan))) - fprintf(stderr, "error: cuspread3d_nuptsdriven_prop, method(%d)\n", d_plan->opts.gpu_method); - if (d_plan->opts.gpu_method == 2 && (ier = cuspread3d_subprob_prop(nf1, nf2, nf3, M, d_plan))) - fprintf(stderr, "error: cuspread3d_subprob_prop, method(%d)\n", d_plan->opts.gpu_method); - if (d_plan->opts.gpu_method == 4 && (ier = cuspread3d_blockgather_prop(nf1, nf2, nf3, M, d_plan))) - fprintf(stderr, "error: cuspread3d_blockgather_prop, method(%d)\n", d_plan->opts.gpu_method); - } break; - } - - return ier; + cufinufft::utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); + + int nf1 = d_plan->nf1; + int nf2 = d_plan->nf2; + int nf3 = d_plan->nf3; + int dim = d_plan->dim; + + d_plan->M = M; + + using namespace cufinufft::memtransfer; + int ier; + switch (d_plan->dim) { + case 1: { + ier = allocgpumem1d_nupts(d_plan); + } break; + case 2: { + ier = allocgpumem2d_nupts(d_plan); + } break; + case 3: { + ier = allocgpumem3d_nupts(d_plan); + } break; + } + if (ier) return ier; + + d_plan->kx 
= d_kx; + if (dim > 1) d_plan->ky = d_ky; + if (dim > 2) d_plan->kz = d_kz; + + using namespace cufinufft::spreadinterp; + switch (d_plan->dim) { + case 1: { + if (d_plan->opts.gpu_method == 1 && + (ier = cuspread1d_nuptsdriven_prop(nf1, M, d_plan))) + fprintf(stderr, "error: cuspread1d_nupts_prop, method(%d)\n", + d_plan->opts.gpu_method); + if (d_plan->opts.gpu_method == 2 && + (ier = cuspread1d_subprob_prop(nf1, M, d_plan))) + fprintf(stderr, "error: cuspread1d_subprob_prop, method(%d)\n", + d_plan->opts.gpu_method); + } break; + case 2: { + if (d_plan->opts.gpu_method == 1 && + (ier = cuspread2d_nuptsdriven_prop(nf1, nf2, M, d_plan))) + fprintf(stderr, "error: cuspread2d_nupts_prop, method(%d)\n", + d_plan->opts.gpu_method); + if (d_plan->opts.gpu_method == 2 && + (ier = cuspread2d_subprob_prop(nf1, nf2, M, d_plan))) + fprintf(stderr, "error: cuspread2d_subprob_prop, method(%d)\n", + d_plan->opts.gpu_method); + } break; + case 3: { + if (d_plan->opts.gpu_method == 1 && + (ier = cuspread3d_nuptsdriven_prop(nf1, nf2, nf3, M, d_plan))) + fprintf(stderr, "error: cuspread3d_nuptsdriven_prop, method(%d)\n", + d_plan->opts.gpu_method); + if (d_plan->opts.gpu_method == 2 && + (ier = cuspread3d_subprob_prop(nf1, nf2, nf3, M, d_plan))) + fprintf(stderr, "error: cuspread3d_subprob_prop, method(%d)\n", + d_plan->opts.gpu_method); + if (d_plan->opts.gpu_method == 4 && + (ier = cuspread3d_blockgather_prop(nf1, nf2, nf3, M, d_plan))) + fprintf(stderr, "error: cuspread3d_blockgather_prop, method(%d)\n", + d_plan->opts.gpu_method); + } break; + } + + return ier; } -template -int cufinufft_execute_impl(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan) +template +int cufinufft_execute_impl(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan) /* "exec" stage (single and double precision versions). @@ -377,53 +403,47 @@ int cufinufft_execute_impl(cuda_complex *d_c, cuda_complex *d_fk, cufinuff Type 2; output for Type 1) Notes: - i) Here CUFINUFFT_CPX is a defined type meaning either complex or complex - to match the precision of the library called. - ii) All operations are done on the GPU device (hence the d_* names) + i) Here CUFINUFFT_CPX is a defined type meaning either complex or + complex to match the precision of the library called. ii) All operations are + done on the GPU device (hence the d_* names) Melody Shih 07/25/19; Barnett 2/16/21. 
*/ { - cufinufft::utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); - int ier; - int type = d_plan->type; - switch (d_plan->dim) { - case 1: { - if (type == 1) - ier = cufinufft1d1_exec(d_c, d_fk, d_plan); - if (type == 2) - ier = cufinufft1d2_exec(d_c, d_fk, d_plan); - if (type == 3) { - std::cerr << "Not Implemented yet" << std::endl; - ier = FINUFFT_ERR_TYPE_NOTVALID; - } - } break; - case 2: { - if (type == 1) - ier = cufinufft2d1_exec(d_c, d_fk, d_plan); - if (type == 2) - ier = cufinufft2d2_exec(d_c, d_fk, d_plan); - if (type == 3) { - std::cerr << "Not Implemented yet" << std::endl; - ier = FINUFFT_ERR_TYPE_NOTVALID; - } - } break; - case 3: { - if (type == 1) - ier = cufinufft3d1_exec(d_c, d_fk, d_plan); - if (type == 2) - ier = cufinufft3d2_exec(d_c, d_fk, d_plan); - if (type == 3) { - std::cerr << "Not Implemented yet" << std::endl; - ier = FINUFFT_ERR_TYPE_NOTVALID; - } - } break; + cufinufft::utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); + int ier; + int type = d_plan->type; + switch (d_plan->dim) { + case 1: { + if (type == 1) ier = cufinufft1d1_exec(d_c, d_fk, d_plan); + if (type == 2) ier = cufinufft1d2_exec(d_c, d_fk, d_plan); + if (type == 3) { + std::cerr << "Not Implemented yet" << std::endl; + ier = FINUFFT_ERR_TYPE_NOTVALID; } + } break; + case 2: { + if (type == 1) ier = cufinufft2d1_exec(d_c, d_fk, d_plan); + if (type == 2) ier = cufinufft2d2_exec(d_c, d_fk, d_plan); + if (type == 3) { + std::cerr << "Not Implemented yet" << std::endl; + ier = FINUFFT_ERR_TYPE_NOTVALID; + } + } break; + case 3: { + if (type == 1) ier = cufinufft3d1_exec(d_c, d_fk, d_plan); + if (type == 2) ier = cufinufft3d2_exec(d_c, d_fk, d_plan); + if (type == 3) { + std::cerr << "Not Implemented yet" << std::endl; + ier = FINUFFT_ERR_TYPE_NOTVALID; + } + } break; + } - return ier; + return ier; } -template +template int cufinufft_destroy_impl(cufinufft_plan_t *d_plan) /* "destroy" stage (single and double precision versions). @@ -435,21 +455,19 @@ int cufinufft_destroy_impl(cufinufft_plan_t *d_plan) Also see ../docs/cppdoc.md for main user-facing documentation. */ { - cufinufft::utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); + cufinufft::utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); - // Can't destroy a null pointer. - if (!d_plan) - return FINUFFT_ERR_PLAN_NOTVALID; + // Can't destroy a null pointer. 
+ if (!d_plan) return FINUFFT_ERR_PLAN_NOTVALID; - using namespace cufinufft::memtransfer; - freegpumemory(d_plan); + using namespace cufinufft::memtransfer; + freegpumemory(d_plan); - if (d_plan->fftplan) - cufftDestroy(d_plan->fftplan); + if (d_plan->fftplan) cufftDestroy(d_plan->fftplan); - /* free/destruct the plan */ - delete d_plan; + /* free/destruct the plan */ + delete d_plan; - return 0; + return 0; } // namespace cufinufft #endif diff --git a/include/cufinufft/memtransfer.h b/include/cufinufft/memtransfer.h index 382f911e9..4c4788b9d 100644 --- a/include/cufinufft/memtransfer.h +++ b/include/cufinufft/memtransfer.h @@ -6,20 +6,13 @@ namespace cufinufft { namespace memtransfer { -template -int allocgpumem1d_plan(cufinufft_plan_t *d_plan); -template -int allocgpumem1d_nupts(cufinufft_plan_t *d_plan); -template -void freegpumemory(cufinufft_plan_t *d_plan); -template -int allocgpumem2d_plan(cufinufft_plan_t *d_plan); -template -int allocgpumem2d_nupts(cufinufft_plan_t *d_plan); -template -int allocgpumem3d_plan(cufinufft_plan_t *d_plan); -template -int allocgpumem3d_nupts(cufinufft_plan_t *d_plan); +template int allocgpumem1d_plan(cufinufft_plan_t *d_plan); +template int allocgpumem1d_nupts(cufinufft_plan_t *d_plan); +template void freegpumemory(cufinufft_plan_t *d_plan); +template int allocgpumem2d_plan(cufinufft_plan_t *d_plan); +template int allocgpumem2d_nupts(cufinufft_plan_t *d_plan); +template int allocgpumem3d_plan(cufinufft_plan_t *d_plan); +template int allocgpumem3d_nupts(cufinufft_plan_t *d_plan); } // namespace memtransfer } // namespace cufinufft diff --git a/include/cufinufft/precision_independent.h b/include/cufinufft/precision_independent.h index ff98506bf..9fa48a07e 100644 --- a/include/cufinufft/precision_independent.h +++ b/include/cufinufft/precision_independent.h @@ -6,8 +6,8 @@ #define PRECISION_INDEPENDENT_H #include -#define rpart(x) (cuCreal(x)) -#define ipart(x) (cuCimag(x)) +#define rpart(x) (cuCreal(x)) +#define ipart(x) (cuCimag(x)) #define cmplx(x, y) (make_cuDoubleComplex(x, y)) namespace cufinufft { namespace common { @@ -20,42 +20,51 @@ __device__ RT cabs(const CT &z); __device__ CT cpow(const CT &z, const int &n); /* Common Kernels from spreadinterp3d */ -__host__ __device__ int calc_global_index(int xidx, int yidx, int zidx, int onx, int ony, int onz, int bnx, int bny, - int bnz); -__device__ int calc_global_index_v2(int xidx, int yidx, int zidx, int nbinx, int nbiny, int nbinz); +__host__ __device__ int calc_global_index(int xidx, int yidx, int zidx, int onx, int ony, + int onz, int bnx, int bny, int bnz); +__device__ int calc_global_index_v2(int xidx, int yidx, int zidx, int nbinx, int nbiny, + int nbinz); /* spreadinterp 1d */ -__global__ void calc_subprob_1d(int *bin_size, int *num_subprob, int maxsubprobsize, int numbins); +__global__ void calc_subprob_1d(int *bin_size, int *num_subprob, int maxsubprobsize, + int numbins); -__global__ void map_b_into_subprob_1d(int *d_subprob_to_bin, int *d_subprobstartpts, int *d_numsubprob, int numbins); +__global__ void map_b_into_subprob_1d(int *d_subprob_to_bin, int *d_subprobstartpts, + int *d_numsubprob, int numbins); __global__ void trivial_global_sort_index_1d(int M, int *index); /* spreadinterp 2d */ -__global__ void calc_subprob_2d(int *bin_size, int *num_subprob, int maxsubprobsize, int numbins); +__global__ void calc_subprob_2d(int *bin_size, int *num_subprob, int maxsubprobsize, + int numbins); -__global__ void map_b_into_subprob_2d(int *d_subprob_to_bin, int *d_subprobstartpts, int 
*d_numsubprob, int numbins); +__global__ void map_b_into_subprob_2d(int *d_subprob_to_bin, int *d_subprobstartpts, + int *d_numsubprob, int numbins); __global__ void trivial_global_sort_index_2d(int M, int *index); /* spreadinterp3d */ -__global__ void calc_subprob_3d_v2(int *bin_size, int *num_subprob, int maxsubprobsize, int numbins); +__global__ void calc_subprob_3d_v2(int *bin_size, int *num_subprob, int maxsubprobsize, + int numbins); -__global__ void map_b_into_subprob_3d_v2(int *d_subprob_to_bin, int *d_subprobstartpts, int *d_numsubprob, int numbins); +__global__ void map_b_into_subprob_3d_v2(int *d_subprob_to_bin, int *d_subprobstartpts, + int *d_numsubprob, int numbins); -__global__ void calc_subprob_3d_v1(int binsperobinx, int binsperobiny, int binsperobinz, int *bin_size, - int *num_subprob, int maxsubprobsize, int numbins); +__global__ void calc_subprob_3d_v1(int binsperobinx, int binsperobiny, int binsperobinz, + int *bin_size, int *num_subprob, int maxsubprobsize, + int numbins); -__global__ void map_b_into_subprob_3d_v1(int *d_subprob_to_obin, int *d_subprobstartpts, int *d_numsubprob, - int numbins); +__global__ void map_b_into_subprob_3d_v1(int *d_subprob_to_obin, int *d_subprobstartpts, + int *d_numsubprob, int numbins); __global__ void trivial_global_sort_index_3d(int M, int *index); -__global__ void fill_ghost_bins(int binsperobinx, int binsperobiny, int binsperobinz, int nobinx, int nobiny, - int nobinz, int *binsize); +__global__ void fill_ghost_bins(int binsperobinx, int binsperobiny, int binsperobinz, + int nobinx, int nobiny, int nobinz, int *binsize); -__global__ void ghost_bin_pts_index(int binsperobinx, int binsperobiny, int binsperobinz, int nobinx, int nobiny, - int nobinz, int *binsize, int *index, int *binstartpts, int M); +__global__ void ghost_bin_pts_index(int binsperobinx, int binsperobiny, int binsperobinz, + int nobinx, int nobiny, int nobinz, int *binsize, + int *index, int *binstartpts, int M); } // namespace common } // namespace cufinufft #endif diff --git a/include/cufinufft/spreadinterp.h b/include/cufinufft/spreadinterp.h index 85850e92a..da1c59930 100644 --- a/include/cufinufft/spreadinterp.h +++ b/include/cufinufft/spreadinterp.h @@ -1,21 +1,20 @@ #ifndef __CUSPREADINTERP_H__ #define __CUSPREADINTERP_H__ +#include #include #include -#include namespace cufinufft { namespace spreadinterp { -template -static __forceinline__ __device__ T fold_rescale(T x, int N) { +template static __forceinline__ __device__ T fold_rescale(T x, int N) { static constexpr const auto x2pi = T(0.159154943091895345554011992339482617); - const T result = x * x2pi + T(0.5); - return (result-floor(result)) * T(N); + const T result = x * x2pi + T(0.5); + return (result - floor(result)) * T(N); } -template +template static inline T evaluate_kernel(T x, const finufft_spread_opts &opts) /* ES ("exp sqrt") kernel evaluation at single real argument: phi(x) = exp(beta.sqrt(1 - (2x/n_s)^2)), for |x| < nspread/2 @@ -23,17 +22,17 @@ static inline T evaluate_kernel(T x, const finufft_spread_opts &opts) approximation to prolate spheroidal wavefunction (PSWF) of order 0. 
This is the "reference implementation", used by eg common/onedim_* 2/17/17 */ { - if (abs(x) >= opts.ES_halfwidth) - // if spreading/FT careful, shouldn't need this if, but causes no speed hit - return 0.0; - else - return exp(opts.ES_beta * sqrt(1.0 - opts.ES_c * x * x)); + if (abs(x) >= opts.ES_halfwidth) + // if spreading/FT careful, shouldn't need this if, but causes no speed hit + return 0.0; + else + return exp(opts.ES_beta * sqrt(1.0 - opts.ES_c * x * x)); } -template +template int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmeth); -template +template static __forceinline__ __device__ T evaluate_kernel(T x, T es_c, T es_beta, int ns) /* ES ("exp sqrt") kernel evaluation at single real argument: phi(x) = exp(beta.sqrt(1 - (2x/n_s)^2)), for |x| < nspread/2 @@ -42,89 +41,95 @@ static __forceinline__ __device__ T evaluate_kernel(T x, T es_c, T es_beta, int This is the "reference implementation", used by eg common/onedim_* 2/17/17 */ { - return abs(x) < ns / 2.0 ? exp(es_beta * (sqrt(1.0 - es_c * x * x))) : 0.0; + return abs(x) < ns / 2.0 ? exp(es_beta * (sqrt(1.0 - es_c * x * x))) : 0.0; } -template -static __inline__ __device__ void eval_kernel_vec_horner(T *ker, const T x, const int w, const double upsampfac) +template +static __inline__ __device__ void eval_kernel_vec_horner(T *ker, const T x, const int w, + const double upsampfac) /* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at x_j = x + j, for j=0,..,w-1. Thus x in [-w/2,-w/2+1]. w is aka ns. This is the current evaluation method, since it's faster (except i7 w=16). Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ { - T z = 2 * x + w - 1.0; // scale so local grid offset z in [-1,1] - // insert the auto-generated code which expects z, w args, writes to ker... - if (upsampfac == 2.0) { // floating point equality is fine here - using FLT = T; - using CUFINUFFT_FLT = T; + T z = 2 * x + w - 1.0; // scale so local grid offset z in [-1,1] + // insert the auto-generated code which expects z, w args, writes to ker... 
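For reference, the Horner branch above evaluates a piecewise-polynomial fit to the same ES ("exp sqrt") kernel documented earlier in this header; comparing the two evaluate_kernel overloads shows that es_c plays the role of (2/n_s)^2. A minimal host-side sketch of both ideas, with hypothetical helper names and double precision only (not library code):

#include <cmath>

// Reference ES kernel from the docstrings above:
// phi(x) = exp(es_beta*sqrt(1 - es_c*x*x)) for |x| < ns/2, else 0.
static double es_kernel_ref(double x, double es_c, double es_beta, int ns) {
  if (std::abs(x) >= ns / 2.0) return 0.0;
  return std::exp(es_beta * std::sqrt(1.0 - es_c * x * x));
}

// Horner evaluation of one degree-d polynomial in z (coef[d] is the leading
// coefficient): the kind of loop the generated ker_horner_allw_loop.inc runs
// once per kernel sample to approximate es_kernel_ref cheaply.
static double horner_eval(const double *coef, int d, double z) {
  double p = coef[d];
  for (int k = d - 1; k >= 0; --k) p = p * z + coef[k];
  return p;
}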
+ if (upsampfac == 2.0) { // floating point equality is fine here + using FLT = T; + using CUFINUFFT_FLT = T; #include "cufinufft/contrib/ker_horner_allw_loop.inc" - } + } } -template -static __inline__ __device__ void eval_kernel_vec(T *ker, const T x, const int w, const T es_c, const T es_beta) { - for (int i = 0; i < w; i++) { - ker[i] = evaluate_kernel(abs(x + i), es_c, es_beta, w); - } +template +static __inline__ __device__ void eval_kernel_vec(T *ker, const T x, const int w, + const T es_c, const T es_beta) { + for (int i = 0; i < w; i++) { + ker[i] = evaluate_kernel(abs(x + i), es_c, es_beta, w); + } } // Functions for calling different methods of spreading & interpolation -template -int cuspread1d(cufinufft_plan_t *d_plan, int blksize); -template -int cuinterp1d(cufinufft_plan_t *d_plan, int blksize); +template int cuspread1d(cufinufft_plan_t *d_plan, int blksize); +template int cuinterp1d(cufinufft_plan_t *d_plan, int blksize); -template -int cuspread2d(cufinufft_plan_t *d_plan, int blksize); -template -int cuinterp2d(cufinufft_plan_t *d_plan, int blksize); -template -int cuspread3d(cufinufft_plan_t *d_plan, int blksize); -template -int cuinterp3d(cufinufft_plan_t *d_plan, int blksize); +template int cuspread2d(cufinufft_plan_t *d_plan, int blksize); +template int cuinterp2d(cufinufft_plan_t *d_plan, int blksize); +template int cuspread3d(cufinufft_plan_t *d_plan, int blksize); +template int cuinterp3d(cufinufft_plan_t *d_plan, int blksize); // Wrappers for methods of spreading -template +template int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t *d_plan); -template +template int cuspread1d_nuptsdriven(int nf1, int M, cufinufft_plan_t *d_plan, int blksize); -template +template int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t *d_plan); -template +template int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t *d_plan, int blksize); -template +template int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan); -template -int cuspread2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int blksize); -template +template +int cuspread2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, + int blksize); +template int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan); -template +template int cuspread2d_subprob(int nf1, int nf2, int m, cufinufft_plan_t *d_plan, int blksize); -template -int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan); -template -int cuspread3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize); -template -int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan); -template -int cuspread3d_blockgather(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize); -template -int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan); -template -int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize); +template +int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M, + cufinufft_plan_t *d_plan); +template +int cuspread3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, + int blksize); +template +int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M, + cufinufft_plan_t *d_plan); +template +int cuspread3d_blockgather(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, + int blksize); +template +int cuspread3d_subprob_prop(int nf1, int 
nf2, int nf3, int M, + cufinufft_plan_t *d_plan); +template +int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, + int blksize); // Wrappers for methods of interpolation -template +template int cuinterp1d_nuptsdriven(int nf1, int M, cufinufft_plan_t *d_plan, int blksize); -template -int cuinterp2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int blksize); -template +template +int cuinterp2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, + int blksize); +template int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int blksize); -template -int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize); -template -int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize); +template +int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, + int blksize); +template +int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, + int blksize); } // namespace spreadinterp } // namespace cufinufft diff --git a/include/cufinufft/types.h b/include/cufinufft/types.h index 246b4aaa1..05b2f6c36 100644 --- a/include/cufinufft/types.h +++ b/include/cufinufft/types.h @@ -3,99 +3,88 @@ #include +#include #include #include #include -#include #include #define CUFINUFFT_BIGINT int // Ugly trick to map a template to a fixed type, here cuda_complex -template -struct cuda_complex_impl; -template <> -struct cuda_complex_impl { - using type = cuFloatComplex; +template struct cuda_complex_impl; +template<> struct cuda_complex_impl { + using type = cuFloatComplex; }; -template <> -struct cuda_complex_impl { - using type = cuDoubleComplex; +template<> struct cuda_complex_impl { + using type = cuDoubleComplex; }; -template -using cuda_complex = typename cuda_complex_impl::type; - -template -struct cufinufft_plan_t { - cufinufft_opts opts; - finufft_spread_opts spopts; - - int type; - int dim; - CUFINUFFT_BIGINT M; - CUFINUFFT_BIGINT nf1; - CUFINUFFT_BIGINT nf2; - CUFINUFFT_BIGINT nf3; - CUFINUFFT_BIGINT ms; - CUFINUFFT_BIGINT mt; - CUFINUFFT_BIGINT mu; - int ntransf; - int maxbatchsize; - int iflag; - - int totalnumsubprob; - T *fwkerhalf1; - T *fwkerhalf2; - T *fwkerhalf3; - - T *kx; - T *ky; - T *kz; - cuda_complex *c; - cuda_complex *fw; - cuda_complex *fk; - - // Arrays that used in subprob method - int *idxnupts; // length: #nupts, index of the nupts in the bin-sorted order - int *sortidx; // length: #nupts, order inside the bin the nupt belongs to - int *numsubprob; // length: #bins, number of subproblems in each bin - int *binsize; // length: #bins, number of nonuniform ponits in each bin - int *binstartpts; // length: #bins, exclusive scan of array binsize - int *subprob_to_bin; // length: #subproblems, the bin the subproblem works on - int *subprobstartpts; // length: #bins, exclusive scan of array numsubprob - - // Arrays for 3d (need to sort out) - int *numnupts; - int *subprob_to_nupts; - - // Temporary variables to do fseries precomputation - std::complex fseries_precomp_a[3 * MAX_NQUAD]; - T fseries_precomp_f[3 * MAX_NQUAD]; - - cufftHandle fftplan; - cudaStream_t stream; +template using cuda_complex = typename cuda_complex_impl::type; + +template struct cufinufft_plan_t { + cufinufft_opts opts; + finufft_spread_opts spopts; + + int type; + int dim; + CUFINUFFT_BIGINT M; + CUFINUFFT_BIGINT nf1; + CUFINUFFT_BIGINT nf2; + CUFINUFFT_BIGINT nf3; + CUFINUFFT_BIGINT ms; + CUFINUFFT_BIGINT mt; + 
CUFINUFFT_BIGINT mu; + int ntransf; + int maxbatchsize; + int iflag; + + int totalnumsubprob; + T *fwkerhalf1; + T *fwkerhalf2; + T *fwkerhalf3; + + T *kx; + T *ky; + T *kz; + cuda_complex *c; + cuda_complex *fw; + cuda_complex *fk; + + // Arrays that used in subprob method + int *idxnupts; // length: #nupts, index of the nupts in the bin-sorted order + int *sortidx; // length: #nupts, order inside the bin the nupt belongs to + int *numsubprob; // length: #bins, number of subproblems in each bin + int *binsize; // length: #bins, number of nonuniform ponits in each bin + int *binstartpts; // length: #bins, exclusive scan of array binsize + int *subprob_to_bin; // length: #subproblems, the bin the subproblem works on + int *subprobstartpts; // length: #bins, exclusive scan of array numsubprob + + // Arrays for 3d (need to sort out) + int *numnupts; + int *subprob_to_nupts; + + // Temporary variables to do fseries precomputation + std::complex fseries_precomp_a[3 * MAX_NQUAD]; + T fseries_precomp_f[3 * MAX_NQUAD]; + + cufftHandle fftplan; + cudaStream_t stream; }; -template -static cufftType_t cufft_type(); -template <> -inline cufftType_t cufft_type() { - return CUFFT_C2C; -} +template static cufftType_t cufft_type(); +template<> inline cufftType_t cufft_type() { return CUFFT_C2C; } -template <> -inline cufftType_t cufft_type() { - return CUFFT_Z2Z; -} +template<> inline cufftType_t cufft_type() { return CUFFT_Z2Z; } -static inline cufftResult cufft_ex(cufftHandle plan, cufftComplex *idata, cufftComplex *odata, int direction) { - return cufftExecC2C(plan, idata, odata, direction); +static inline cufftResult cufft_ex(cufftHandle plan, cufftComplex *idata, + cufftComplex *odata, int direction) { + return cufftExecC2C(plan, idata, odata, direction); } -static inline cufftResult cufft_ex(cufftHandle plan, cufftDoubleComplex *idata, cufftDoubleComplex *odata, - int direction) { - return cufftExecZ2Z(plan, idata, odata, direction); +static inline cufftResult cufft_ex(cufftHandle plan, cufftDoubleComplex *idata, + cufftDoubleComplex *odata, int direction) { + return cufftExecZ2Z(plan, idata, odata, direction); } #endif diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h index e8deb42e9..3455b99c0 100644 --- a/include/cufinufft/utils.h +++ b/include/cufinufft/utils.h @@ -15,59 +15,58 @@ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 || defined(__clang__) #else __inline__ __device__ double atomicAdd(double *address, double val) { - unsigned long long int *address_as_ull = (unsigned long long int *)address; - unsigned long long int old = *address_as_ull, assumed; + unsigned long long int *address_as_ull = (unsigned long long int *)address; + unsigned long long int old = *address_as_ull, assumed; - do { - assumed = old; - old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); - // Note: uses integer comparison to avoid hang in case of NaN - // (since NaN != NaN) - } while (assumed != old); + // Note: uses integer comparison to avoid hang in case of NaN + // (since NaN != NaN) + } while (assumed != old); - return __longlong_as_double(old); + return __longlong_as_double(old); } #endif namespace cufinufft { namespace utils { class WithCudaDevice { - public: - WithCudaDevice(int device) { - cudaGetDevice(&orig_device_); - cudaSetDevice(device); - } +public: + WithCudaDevice(int device) { + 
cudaGetDevice(&orig_device_); + cudaSetDevice(device); + } - ~WithCudaDevice() { cudaSetDevice(orig_device_); } + ~WithCudaDevice() { cudaSetDevice(orig_device_); } - private: - int orig_device_; +private: + int orig_device_; }; // jfm timer class class CNTime { - public: - void start(); - double restart(); - double elapsedsec(); +public: + void start(); + double restart(); + double elapsedsec(); - private: - struct timeval initial; +private: + struct timeval initial; }; // ahb math helpers CUFINUFFT_BIGINT next235beven(CUFINUFFT_BIGINT n, CUFINUFFT_BIGINT b); -template -T infnorm(int n, std::complex *a) { - T nrm = 0.0; - for (int m = 0; m < n; ++m) { - T aa = real(conj(a[m]) * a[m]); - if (aa > nrm) - nrm = aa; - } - return sqrt(nrm); +template T infnorm(int n, std::complex *a) { + T nrm = 0.0; + for (int m = 0; m < n; ++m) { + T aa = real(conj(a[m]) * a[m]); + if (aa > nrm) nrm = aa; + } + return sqrt(nrm); } } // namespace utils } // namespace cufinufft diff --git a/include/cufinufft_opts.h b/include/cufinufft_opts.h index a3da46f0d..c9898f3b7 100644 --- a/include/cufinufft_opts.h +++ b/include/cufinufft_opts.h @@ -2,33 +2,33 @@ #define __CUFINUFFT_OPTS_H__ typedef struct cufinufft_opts { // see cufinufft_default_opts() for defaults - double upsampfac; // upsampling ratio sigma, only 2.0 (standard) is implemented - /* following options are for gpu */ - int gpu_method; // 1: nonuniform-pts driven, 2: shared mem (SM) - int gpu_sort; // when NU-pts driven: 0: no sort (GM), 1: sort (GM-sort) + double upsampfac; // upsampling ratio sigma, only 2.0 (standard) is implemented + /* following options are for gpu */ + int gpu_method; // 1: nonuniform-pts driven, 2: shared mem (SM) + int gpu_sort; // when NU-pts driven: 0: no sort (GM), 1: sort (GM-sort) - int gpu_binsizex; // used for 2D, 3D subproblem method - int gpu_binsizey; - int gpu_binsizez; + int gpu_binsizex; // used for 2D, 3D subproblem method + int gpu_binsizey; + int gpu_binsizez; - int gpu_obinsizex; // used for 3D spread block gather method - int gpu_obinsizey; - int gpu_obinsizez; + int gpu_obinsizex; // used for 3D spread block gather method + int gpu_obinsizey; + int gpu_obinsizez; - int gpu_maxsubprobsize; - int gpu_kerevalmeth; // 0: direct exp(sqrt()), 1: Horner ppval + int gpu_maxsubprobsize; + int gpu_kerevalmeth; // 0: direct exp(sqrt()), 1: Horner ppval - int gpu_spreadinterponly; // 0: NUFFT, 1: spread or interpolation only + int gpu_spreadinterponly; // 0: NUFFT, 1: spread or interpolation only - int gpu_maxbatchsize; + int gpu_maxbatchsize; - /* multi-gpu support */ - int gpu_device_id; + /* multi-gpu support */ + int gpu_device_id; - void *gpu_stream; + void *gpu_stream; - int modeord; // (type 1,2 only): 0 CMCL-style increasing mode order - // 1 FFT-style mode order + int modeord; // (type 1,2 only): 0 CMCL-style increasing mode order + // 1 FFT-style mode order } cufinufft_opts; #endif diff --git a/include/finufft.h b/include/finufft.h index 71a38f9be..487a3eb4f 100644 --- a/include/finufft.h +++ b/include/finufft.h @@ -5,7 +5,6 @@ // They will clobber any prior macros starting FINUFFT*, so in the lib/test // sources finufft.h must be included before defs.h - /* Devnotes. A) Two precisions done by including the "either precision" headers twice. No use of the private headers for lib/test/example compilation is made. 
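To see how the pieces declared so far fit together, the sketch below strings the four *_impl stages from include/cufinufft/impl.h into a single-precision 1D type-1 transform. It is illustrative only: the <float> template arguments are assumed (the declarations above are templated on the real type T), the include paths are assumed, and d_x, d_c, d_fk are assumed to be device arrays already filled by the caller; real code would normally call the public C API wrappers rather than these internals.

#include <cufinufft.h>      // assumed: declares cufinufft_opts and cufinufft_default_opts
#include <cufinufft/impl.h> // the *_impl templates shown earlier in this diff

static int type1_1d_sketch(int M, float *d_x, cuda_complex<float> *d_c,
                           int N1, cuda_complex<float> *d_fk, float tol) {
  cufinufft_opts opts;
  cufinufft_default_opts(&opts); // defaults, as makeplan does when opts==NULL
  opts.gpu_method = 0;           // 0: let makeplan pick SM vs GM (heuristic above)
  int nmodes[3] = {N1, 1, 1};    // ms, mt, mu for a 1D transform
  cufinufft_plan_t<float> *plan = nullptr;
  int ier = cufinufft_makeplan_impl<float>(1 /*type*/, 1 /*dim*/, nmodes, +1 /*iflag*/,
                                           1 /*ntransf*/, tol, &plan, &opts);
  if (ier > 1) return ier;       // ier==1 is only the eps-too-small warning
  ier = cufinufft_setpts_impl<float>(M, d_x, nullptr, nullptr, 0, nullptr, nullptr,
                                     nullptr, plan);
  if (!ier) ier = cufinufft_execute_impl<float>(d_c, d_fk, plan);
  cufinufft_destroy_impl<float>(plan);
  return ier;
}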
@@ -37,7 +36,7 @@ #define FINUFFT_BIGINT int64_t #ifndef __cplusplus -#include // for bool type in C (needed for item in plan struct) +#include // for bool type in C (needed for item in plan struct) #endif // this macro name has to be safe since exposed to user @@ -50,4 +49,4 @@ // clean up any purely local defs that are not in finufft_eitherprec.h... #undef FINUFFT_BIGINT -#endif // FINUFFT_H +#endif // FINUFFT_H diff --git a/include/finufft/defs.h b/include/finufft/defs.h index 8780ff914..c2a5c48f7 100644 --- a/include/finufft/defs.h +++ b/include/finufft/defs.h @@ -19,7 +19,6 @@ // (and clobbers FINUFFT* macros; watch out!) #include - // --------------- Private data types for compilation in either prec --------- // Devnote: must match those in relevant prec of public finufft.h interface! @@ -28,13 +27,13 @@ #define BIGINT int64_t // Precision-independent real and complex types, for private lib/test compile #ifdef SINGLE - #define FLT float +#define FLT float #else - #define FLT double +#define FLT double #endif // next line possibly obsolete... #define _USE_MATH_DEFINES -#include // we define C++ complex type only +#include // we define C++ complex type only #define CPX std::complex // inline macro, to force inlining of small functions @@ -50,88 +49,84 @@ // ------------- Library-wide algorithm parameter settings ---------------- // Library version (is a string) -#define FINUFFT_VER "2.2.0" +#define FINUFFT_VER "2.2.0" // Largest possible kernel spread width per dimension, in fine grid points // (used only in spreadinterp.cpp) -#define MAX_NSPREAD 16 +#define MAX_NSPREAD 16 // Fraction growth cut-off in utils:arraywidcen, sets when translate in type-3 #define ARRAYWIDCEN_GROWFRAC 0.1 // Max number of positive quadr nodes for kernel FT (used only in common.cpp) -#define MAX_NQUAD 100 +#define MAX_NQUAD 100 // Internal (nf1 etc) array allocation size that immediately raises error. // (Note: next235 takes 1s for 1e11, so it is also to prevent hang here.) // Increase this if you need >10TB (!) RAM... -#define MAX_NF (BIGINT)1e12 +#define MAX_NF (BIGINT)1e12 // Maximum allowed number M of NU points; useful to catch incorrectly cast int32 // values for M = nj (also nk in type 3)... -#define MAX_NU_PTS (BIGINT)1e14 - +#define MAX_NU_PTS (BIGINT)1e14 // -------------- Math consts (not in math.h) and useful math macros ---------- #include // either-precision unit imaginary number... -#define IMA (CPX(0.0,1.0)) +#define IMA (CPX(0.0, 1.0)) // using namespace std::complex_literals; // needs C++14, provides 1i, 1if -#ifndef M_PI // Windows apparently doesn't have this const - #define M_PI 3.14159265358979329 +#ifndef M_PI // Windows apparently doesn't have this const +#define M_PI 3.14159265358979329 #endif #define M_1_2PI 0.159154943091895336 #define M_2PI 6.28318530717958648 // to avoid mixed precision operators in eg i*pi, an either-prec PI... -#define PI (FLT)M_PI +#define PI (FLT) M_PI // machine epsilon for decisions of achievable tolerance... #ifdef SINGLE - #define EPSILON (float)6e-08 +#define EPSILON (float)6e-08 #else - #define EPSILON (double)1.1e-16 +#define EPSILON (double)1.1e-16 #endif - // Random numbers: crappy unif random number generator in [0,1). // These macros should probably be replaced by modern C++ std lib or random123. 
// (RAND_MAX is in stdlib.h) #include -//#define rand01() (((FLT)(rand()%RAND_MAX))/RAND_MAX) -#define rand01() ((FLT)rand()/(FLT)RAND_MAX) +// #define rand01() (((FLT)(rand()%RAND_MAX))/RAND_MAX) +#define rand01() ((FLT)rand() / (FLT)RAND_MAX) // unif[-1,1]: -#define randm11() (2*rand01() - (FLT)1.0) +#define randm11() (2 * rand01() - (FLT)1.0) // complex unif[-1,1] for Re and Im: -#define crandm11() (randm11() + IMA*randm11()) +#define crandm11() (randm11() + IMA * randm11()) // Thread-safe seed-carrying versions of above (x is ptr to seed)... -#define rand01r(x) ((FLT)rand_r(x)/(FLT)RAND_MAX) +#define rand01r(x) ((FLT)rand_r(x) / (FLT)RAND_MAX) // unif[-1,1]: -#define randm11r(x) (2*rand01r(x) - (FLT)1.0) +#define randm11r(x) (2 * rand01r(x) - (FLT)1.0) // complex unif[-1,1] for Re and Im: -#define crandm11r(x) (randm11r(x) + IMA*randm11r(x)) - +#define crandm11r(x) (randm11r(x) + IMA * randm11r(x)) // ----- OpenMP macros which also work when omp not present ----- // Allows compile-time switch off of openmp, so compilation without any openmp // is done (Note: _OPENMP is automatically set by -fopenmp compile flag) #ifdef _OPENMP - #include - // point to actual omp utils - #define MY_OMP_GET_NUM_THREADS() omp_get_num_threads() - #define MY_OMP_GET_MAX_THREADS() omp_get_max_threads() - #define MY_OMP_GET_THREAD_NUM() omp_get_thread_num() - #define MY_OMP_SET_NUM_THREADS(x) omp_set_num_threads(x) +#include +// point to actual omp utils +#define MY_OMP_GET_NUM_THREADS() omp_get_num_threads() +#define MY_OMP_GET_MAX_THREADS() omp_get_max_threads() +#define MY_OMP_GET_THREAD_NUM() omp_get_thread_num() +#define MY_OMP_SET_NUM_THREADS(x) omp_set_num_threads(x) #else - // non-omp safe dummy versions of omp utils... - #define MY_OMP_GET_NUM_THREADS() 1 - #define MY_OMP_GET_MAX_THREADS() 1 - #define MY_OMP_GET_THREAD_NUM() 0 - #define MY_OMP_SET_NUM_THREADS(x) +// non-omp safe dummy versions of omp utils... +#define MY_OMP_GET_NUM_THREADS() 1 +#define MY_OMP_GET_MAX_THREADS() 1 +#define MY_OMP_GET_THREAD_NUM() 0 +#define MY_OMP_SET_NUM_THREADS(x) #endif - // Prec-switching name macros (respond to SINGLE), used in lib & test sources // and the plan object below. // Note: crucially, these are now indep of macros used to gen public finufft.h! @@ -145,103 +140,101 @@ #else #define FINUFFTIFY_UNSAFE(x) finufft##x #endif -#define FINUFFTIFY(x) FINUFFTIFY_UNSAFE(x) +#define FINUFFTIFY(x) FINUFFTIFY_UNSAFE(x) // Now use the above tool to set up 2020-style macros used in tester source... 
-#define FINUFFT_PLAN FINUFFTIFY(_plan) -#define FINUFFT_PLAN_S FINUFFTIFY(_plan_s) +#define FINUFFT_PLAN FINUFFTIFY(_plan) +#define FINUFFT_PLAN_S FINUFFTIFY(_plan_s) #define FINUFFT_DEFAULT_OPTS FINUFFTIFY(_default_opts) -#define FINUFFT_MAKEPLAN FINUFFTIFY(_makeplan) -#define FINUFFT_SETPTS FINUFFTIFY(_setpts) -#define FINUFFT_EXECUTE FINUFFTIFY(_execute) -#define FINUFFT_DESTROY FINUFFTIFY(_destroy) -#define FINUFFT1D1 FINUFFTIFY(1d1) -#define FINUFFT1D2 FINUFFTIFY(1d2) -#define FINUFFT1D3 FINUFFTIFY(1d3) -#define FINUFFT2D1 FINUFFTIFY(2d1) -#define FINUFFT2D2 FINUFFTIFY(2d2) -#define FINUFFT2D3 FINUFFTIFY(2d3) -#define FINUFFT3D1 FINUFFTIFY(3d1) -#define FINUFFT3D2 FINUFFTIFY(3d2) -#define FINUFFT3D3 FINUFFTIFY(3d3) -#define FINUFFT1D1MANY FINUFFTIFY(1d1many) -#define FINUFFT1D2MANY FINUFFTIFY(1d2many) -#define FINUFFT1D3MANY FINUFFTIFY(1d3many) -#define FINUFFT2D1MANY FINUFFTIFY(2d1many) -#define FINUFFT2D2MANY FINUFFTIFY(2d2many) -#define FINUFFT2D3MANY FINUFFTIFY(2d3many) -#define FINUFFT3D1MANY FINUFFTIFY(3d1many) -#define FINUFFT3D2MANY FINUFFTIFY(3d2many) -#define FINUFFT3D3MANY FINUFFTIFY(3d3many) - +#define FINUFFT_MAKEPLAN FINUFFTIFY(_makeplan) +#define FINUFFT_SETPTS FINUFFTIFY(_setpts) +#define FINUFFT_EXECUTE FINUFFTIFY(_execute) +#define FINUFFT_DESTROY FINUFFTIFY(_destroy) +#define FINUFFT1D1 FINUFFTIFY(1d1) +#define FINUFFT1D2 FINUFFTIFY(1d2) +#define FINUFFT1D3 FINUFFTIFY(1d3) +#define FINUFFT2D1 FINUFFTIFY(2d1) +#define FINUFFT2D2 FINUFFTIFY(2d2) +#define FINUFFT2D3 FINUFFTIFY(2d3) +#define FINUFFT3D1 FINUFFTIFY(3d1) +#define FINUFFT3D2 FINUFFTIFY(3d2) +#define FINUFFT3D3 FINUFFTIFY(3d3) +#define FINUFFT1D1MANY FINUFFTIFY(1d1many) +#define FINUFFT1D2MANY FINUFFTIFY(1d2many) +#define FINUFFT1D3MANY FINUFFTIFY(1d3many) +#define FINUFFT2D1MANY FINUFFTIFY(2d1many) +#define FINUFFT2D2MANY FINUFFTIFY(2d2many) +#define FINUFFT2D3MANY FINUFFTIFY(2d3many) +#define FINUFFT3D1MANY FINUFFTIFY(3d1many) +#define FINUFFT3D2MANY FINUFFTIFY(3d2many) +#define FINUFFT3D3MANY FINUFFTIFY(3d3many) // -------- FINUFFT's plan object, prec-switching version ------------------ // NB: now private (the public C++ or C etc user sees an opaque pointer to it) // FFTW is needed since we include a FFTW plan in the FINUFFT plan... -#include // (must come after complex.h) +#include // (must come after complex.h) // (other FFT lib headers eg MKL could be here...) 
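The FINUFFTIFY machinery above is the standard two-level token-pasting idiom: FINUFFTIFY_UNSAFE does the actual paste, while the FINUFFTIFY wrapper adds the extra expansion step that keeps the paste safe when the argument is itself a macro (the isocpp FAQ linked in fftw_defs.h below explains why). A standalone sketch of the same trick with hypothetical names, not the library's:

#ifdef SINGLE
#define MYLIB_IFY_UNSAFE(x) mylibf##x    // single-precision symbol family
#else
#define MYLIB_IFY_UNSAFE(x) mylib##x     // double-precision symbol family
#endif
#define MYLIB_IFY(x) MYLIB_IFY_UNSAFE(x) // 2nd level: expand x before pasting

#define MYLIB_MAKEPLAN MYLIB_IFY(_makeplan)

int mylib_makeplan(void)  { return 0; }  // a default (double) build resolves to this
int mylibf_makeplan(void) { return 0; }  // a -DSINGLE build resolves to this

int demo(void) { return MYLIB_MAKEPLAN(); } // i.e. mylib_makeplan() by default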
// group together a bunch of type 3 rescaling/centering/phasing parameters: #define TYPE3PARAMS FINUFFTIFY(_type3Params) typedef struct { - FLT X1,C1,D1,h1,gam1; // x dim: X=halfwid C=center D=freqcen h,gam=rescale - FLT X2,C2,D2,h2,gam2; // y - FLT X3,C3,D3,h3,gam3; // z + FLT X1, C1, D1, h1, gam1; // x dim: X=halfwid C=center D=freqcen h,gam=rescale + FLT X2, C2, D2, h2, gam2; // y + FLT X3, C3, D3, h3, gam3; // z } TYPE3PARAMS; -typedef struct FINUFFT_PLAN_S { // the main plan object, fully C++ - - int type; // transform type (Rokhlin naming): 1,2 or 3 - int dim; // overall dimension: 1,2 or 3 - int ntrans; // how many transforms to do at once (vector or "many" mode) - BIGINT nj; // num of NU pts in type 1,2 (for type 3, num input x pts) - BIGINT nk; // number of NU freq pts (type 3 only) - FLT tol; // relative user tolerance - int batchSize; // # strength vectors to group together for FFTW, etc - int nbatch; // how many batches done to cover all ntrans vectors - - BIGINT ms; // number of modes in x (1) dir (historical CMCL name) = N1 - BIGINT mt; // number of modes in y (2) direction = N2 - BIGINT mu; // number of modes in z (3) direction = N3 - BIGINT N; // total # modes (prod of above three) - - BIGINT nf1; // size of internal fine grid in x (1) direction - BIGINT nf2; // " y (2) - BIGINT nf3; // " z (3) - BIGINT nf; // total # fine grid points (product of the above three) - - int fftSign; // sign in exponential for NUFFT defn, guaranteed to be +-1 - - FLT* phiHat1; // FT of kernel in t1,2, on x-axis mode grid - FLT* phiHat2; // " y-axis. - FLT* phiHat3; // " z-axis. - - FFTW_CPX* fwBatch; // (batches of) fine grid(s) for FFTW to plan - // & act on. Usually the largest working array - - BIGINT *sortIndices; // precomputed NU pt permutation, speeds spread/interp - bool didSort; // whether binsorting used (false: identity perm used) - - FLT *X, *Y, *Z; // for t1,2: ptr to user-supplied NU pts (no new allocs). - // for t3: allocated as "primed" (scaled) src pts x'_j, etc +typedef struct FINUFFT_PLAN_S { // the main plan object, fully C++ + + int type; // transform type (Rokhlin naming): 1,2 or 3 + int dim; // overall dimension: 1,2 or 3 + int ntrans; // how many transforms to do at once (vector or "many" mode) + BIGINT nj; // num of NU pts in type 1,2 (for type 3, num input x pts) + BIGINT nk; // number of NU freq pts (type 3 only) + FLT tol; // relative user tolerance + int batchSize; // # strength vectors to group together for FFTW, etc + int nbatch; // how many batches done to cover all ntrans vectors + + BIGINT ms; // number of modes in x (1) dir (historical CMCL name) = N1 + BIGINT mt; // number of modes in y (2) direction = N2 + BIGINT mu; // number of modes in z (3) direction = N3 + BIGINT N; // total # modes (prod of above three) + + BIGINT nf1; // size of internal fine grid in x (1) direction + BIGINT nf2; // " y (2) + BIGINT nf3; // " z (3) + BIGINT nf; // total # fine grid points (product of the above three) + + int fftSign; // sign in exponential for NUFFT defn, guaranteed to be +-1 + + FLT *phiHat1; // FT of kernel in t1,2, on x-axis mode grid + FLT *phiHat2; // " y-axis. + FLT *phiHat3; // " z-axis. + + FFTW_CPX *fwBatch; // (batches of) fine grid(s) for FFTW to plan + // & act on. Usually the largest working array + + BIGINT *sortIndices; // precomputed NU pt permutation, speeds spread/interp + bool didSort; // whether binsorting used (false: identity perm used) + + FLT *X, *Y, *Z; // for t1,2: ptr to user-supplied NU pts (no new allocs). 
+ // for t3: allocated as "primed" (scaled) src pts x'_j, etc // type 3 specific - FLT *S, *T, *U; // pointers to user's target NU pts arrays (no new allocs) - CPX* prephase; // pre-phase, for all input NU pts - CPX* deconv; // reciprocal of kernel FT, phase, all output NU pts - CPX* CpBatch; // working array of prephased strengths - FLT *Sp, *Tp, *Up; // internal primed targs (s'_k, etc), allocated - TYPE3PARAMS t3P; // groups together type 3 shift, scale, phase, parameters - FINUFFT_PLAN innerT2plan; // ptr used for type 2 in step 2 of type 3 - + FLT *S, *T, *U; // pointers to user's target NU pts arrays (no new allocs) + CPX *prephase; // pre-phase, for all input NU pts + CPX *deconv; // reciprocal of kernel FT, phase, all output NU pts + CPX *CpBatch; // working array of prephased strengths + FLT *Sp, *Tp, *Up; // internal primed targs (s'_k, etc), allocated + TYPE3PARAMS t3P; // groups together type 3 shift, scale, phase, parameters + FINUFFT_PLAN innerT2plan; // ptr used for type 2 in step 2 of type 3 + // other internal structs; each is C-compatible of course FFTW_PLAN fftwPlan; - finufft_opts opts; // this and spopts could be made ptrs + finufft_opts opts; // this and spopts could be made ptrs finufft_spread_opts spopts; - + } FINUFFT_PLAN_S; #undef TYPE3PARAMS - -#endif // DEFS_H +#endif // DEFS_H diff --git a/include/finufft/dirft.h b/include/finufft/dirft.h index 88f1dd2df..5d13265a4 100644 --- a/include/finufft/dirft.h +++ b/include/finufft/dirft.h @@ -3,16 +3,20 @@ #include -void dirft1d1(BIGINT nj,FLT* x,CPX* c,int isign,BIGINT ms, CPX* f); -void dirft1d2(BIGINT nj,FLT* x,CPX* c,int iflag,BIGINT ms, CPX* f); -void dirft1d3(BIGINT nj,FLT* x,CPX* c,int iflag,BIGINT nk, FLT* s, CPX* f); +void dirft1d1(BIGINT nj, FLT *x, CPX *c, int isign, BIGINT ms, CPX *f); +void dirft1d2(BIGINT nj, FLT *x, CPX *c, int iflag, BIGINT ms, CPX *f); +void dirft1d3(BIGINT nj, FLT *x, CPX *c, int iflag, BIGINT nk, FLT *s, CPX *f); -void dirft2d1(BIGINT nj,FLT* x,FLT *y,CPX* c,int iflag,BIGINT ms, BIGINT mt, CPX* f); -void dirft2d2(BIGINT nj,FLT* x,FLT *y,CPX* c,int iflag,BIGINT ms, BIGINT mt, CPX* f); -void dirft2d3(BIGINT nj,FLT* x,FLT *y,CPX* c,int iflag,BIGINT nk, FLT* s, FLT* t, CPX* f); +void dirft2d1(BIGINT nj, FLT *x, FLT *y, CPX *c, int iflag, BIGINT ms, BIGINT mt, CPX *f); +void dirft2d2(BIGINT nj, FLT *x, FLT *y, CPX *c, int iflag, BIGINT ms, BIGINT mt, CPX *f); +void dirft2d3(BIGINT nj, FLT *x, FLT *y, CPX *c, int iflag, BIGINT nk, FLT *s, FLT *t, + CPX *f); -void dirft3d1(BIGINT nj,FLT* x,FLT *y,FLT *z,CPX* c,int iflag,BIGINT ms, BIGINT mt, BIGINT mu, CPX* f); -void dirft3d2(BIGINT nj,FLT* x,FLT *y,FLT *z,CPX* c,int iflag,BIGINT ms, BIGINT mt, BIGINT mu, CPX* f); -void dirft3d3(BIGINT nj,FLT* x,FLT *y,FLT *z,CPX* c,int iflag,BIGINT nk, FLT* s, FLT* t, FLT *u, CPX* f); +void dirft3d1(BIGINT nj, FLT *x, FLT *y, FLT *z, CPX *c, int iflag, BIGINT ms, BIGINT mt, + BIGINT mu, CPX *f); +void dirft3d2(BIGINT nj, FLT *x, FLT *y, FLT *z, CPX *c, int iflag, BIGINT ms, BIGINT mt, + BIGINT mu, CPX *f); +void dirft3d3(BIGINT nj, FLT *x, FLT *y, FLT *z, CPX *c, int iflag, BIGINT nk, FLT *s, + FLT *t, FLT *u, CPX *f); #endif diff --git a/include/finufft/fftw_defs.h b/include/finufft/fftw_defs.h index 89d86f0de..1771ff259 100644 --- a/include/finufft/fftw_defs.h +++ b/include/finufft/fftw_defs.h @@ -7,42 +7,42 @@ // precision library compilation, which need different FFTW command symbols. // Barnett simplified via FFTWIFY, 6/7/22. 
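For context, the dirft* routines declared above are the O(nj*N) brute-force evaluators that the testers use as ground truth. Below is a self-contained sketch of the 1D type-1 case, assuming the usual FINUFFT convention f_k = sum_j c_j exp(i*isign*k*x_j) with CMCL-style increasing modes k = -ms/2, ..., ms/2-1 (even ms shown); it is an illustration, not the library's dirft1d1:

#include <complex>
#include <vector>

using cpx = std::complex<double>;

// f[m] holds mode k = m - ms/2, i.e. f_k = sum_j c_j * exp(i*isign*k*x_j).
std::vector<cpx> dirft1d1_sketch(const std::vector<double> &x,
                                 const std::vector<cpx> &c, int isign, long ms) {
  std::vector<cpx> f(ms, cpx(0.0, 0.0));
  for (long m = 0; m < ms; ++m) {
    const double k = double(m - ms / 2);            // centered (CMCL) mode index
    for (std::size_t j = 0; j < x.size(); ++j)
      f[m] += c[j] * std::exp(cpx(0.0, isign * k * x[j]));
  }
  return f;
}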
-#include // (after complex.h) needed so can typedef FFTW_CPX +#include // (after complex.h) needed so can typedef FFTW_CPX // precision-switching names for interfaces to FFTW... #ifdef SINGLE - // macro to prepend fftw_ (for double) or fftwf_ (for single) to a string - // without a space. The 2nd level of indirection is needed for safety, see: - // https://isocpp.org/wiki/faq/misc-technical-issues#macros-with-token-pasting - #define FFTWIFY_UNSAFE(x) fftwf_##x +// macro to prepend fftw_ (for double) or fftwf_ (for single) to a string +// without a space. The 2nd level of indirection is needed for safety, see: +// https://isocpp.org/wiki/faq/misc-technical-issues#macros-with-token-pasting +#define FFTWIFY_UNSAFE(x) fftwf_##x #else - #define FFTWIFY_UNSAFE(x) fftw_##x +#define FFTWIFY_UNSAFE(x) fftw_##x #endif -#define FFTWIFY(x) FFTWIFY_UNSAFE(x) +#define FFTWIFY(x) FFTWIFY_UNSAFE(x) // now use this tool (note we replaced typedefs v<=2.0.4, in favor of macros): -#define FFTW_CPX FFTWIFY(complex) -#define FFTW_PLAN FFTWIFY(plan) -#define FFTW_ALLOC_RE FFTWIFY(alloc_real) -#define FFTW_ALLOC_CPX FFTWIFY(alloc_complex) -#define FFTW_PLAN_1D FFTWIFY(plan_dft_1d) -#define FFTW_PLAN_2D FFTWIFY(plan_dft_2d) -#define FFTW_PLAN_3D FFTWIFY(plan_dft_3d) +#define FFTW_CPX FFTWIFY(complex) +#define FFTW_PLAN FFTWIFY(plan) +#define FFTW_ALLOC_RE FFTWIFY(alloc_real) +#define FFTW_ALLOC_CPX FFTWIFY(alloc_complex) +#define FFTW_PLAN_1D FFTWIFY(plan_dft_1d) +#define FFTW_PLAN_2D FFTWIFY(plan_dft_2d) +#define FFTW_PLAN_3D FFTWIFY(plan_dft_3d) #define FFTW_PLAN_MANY_DFT FFTWIFY(plan_many_dft) -#define FFTW_EX FFTWIFY(execute) -#define FFTW_DE FFTWIFY(destroy_plan) -#define FFTW_FR FFTWIFY(free) +#define FFTW_EX FFTWIFY(execute) +#define FFTW_DE FFTWIFY(destroy_plan) +#define FFTW_FR FFTWIFY(free) #define FFTW_FORGET_WISDOM FFTWIFY(forget_wisdom) -#define FFTW_CLEANUP FFTWIFY(cleanup) +#define FFTW_CLEANUP FFTWIFY(cleanup) // the following OMP switch could be done in the src code instead... #ifdef _OPENMP - #define FFTW_INIT FFTWIFY(init_threads) - #define FFTW_PLAN_TH FFTWIFY(plan_with_nthreads) - #define FFTW_CLEANUP_THREADS FFTWIFY(cleanup_threads) +#define FFTW_INIT FFTWIFY(init_threads) +#define FFTW_PLAN_TH FFTWIFY(plan_with_nthreads) +#define FFTW_CLEANUP_THREADS FFTWIFY(cleanup_threads) #else - // no OMP (no fftw{f}_threads or _omp), need dummy fftw threads calls... - #define FFTW_INIT() - #define FFTW_PLAN_TH(x) - #define FFTW_CLEANUP_THREADS() +// no OMP (no fftw{f}_threads or _omp), need dummy fftw threads calls... +#define FFTW_INIT() +#define FFTW_PLAN_TH(x) +#define FFTW_CLEANUP_THREADS() #endif -#endif // FFTW_DEFS_H +#endif // FFTW_DEFS_H diff --git a/include/finufft/spreadinterp.h b/include/finufft/spreadinterp.h index 853b6c2b1..0900dd31b 100644 --- a/include/finufft/spreadinterp.h +++ b/include/finufft/spreadinterp.h @@ -26,32 +26,38 @@ #define TF_OMIT_SPREADING 8 // don't interp/spread (dir=1: to subgrids) namespace finufft { - namespace spreadinterp { +namespace spreadinterp { // things external (spreadinterp) interface needs... 
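As rough orientation, the functions declared just below are typically composed in the order sketched here. This is a hedged sketch inferred only from the argument lists in this header and from the finufft_spread_opts comments elsewhere in this change (spread_direction = 1 means spread NU->U); the helper name spread_1d_demo is hypothetical, the nonuniform/uniform data arrays are assumed to hold interleaved real/imag FLT pairs, and passing nullptr for the unused ky, kz in 1D is an assumption.

// spread M nonuniform strengths onto a length-N1 fine grid (1D case; N2 = N3 = 1)
static int spread_1d_demo(BIGINT N1, FLT *grid, BIGINT M, FLT *kx, FLT *str, FLT eps) {
  using namespace finufft::spreadinterp;
  finufft_spread_opts opts;
  int ier = setup_spreader(opts, eps, 2.0, 1, 0, 1, 1); // upsampfac=2, Horner, dim=1
  if (ier > 1) return ier;                              // (assume >1 signals an error)
  opts.spread_direction = 1;                            // spread NU->U
  if ((ier = spreadcheck(N1, 1, 1, M, kx, nullptr, nullptr, opts))) return ier;
  BIGINT *sortIdx = new BIGINT[M];
  int didSort     = indexSort(sortIdx, N1, 1, 1, M, kx, nullptr, nullptr, opts);
  ier = spreadinterpSorted(sortIdx, N1, 1, 1, grid, M, kx, nullptr, nullptr, str, opts,
                           didSort);
  delete[] sortIdx;
  return ier;
}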
-FINUFFT_EXPORT int FINUFFT_CDECL spreadinterp(BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, - BIGINT M, FLT *kx, FLT *ky, FLT *kz, - FLT *data_nonuniform, finufft_spread_opts opts); -FINUFFT_EXPORT int FINUFFT_CDECL spreadcheck(BIGINT N1, BIGINT N2, BIGINT N3, - BIGINT M, FLT *kx, FLT *ky, FLT *kz, finufft_spread_opts opts); -FINUFFT_EXPORT int FINUFFT_CDECL indexSort(BIGINT* sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, BIGINT M, - FLT *kx, FLT *ky, FLT *kz, finufft_spread_opts opts); -FINUFFT_EXPORT int FINUFFT_CDECL interpSorted(BIGINT* sort_indices,BIGINT N1, BIGINT N2, BIGINT N3, - FLT *data_uniform,BIGINT M, FLT *kx, FLT *ky, FLT *kz, - FLT *data_nonuniform, finufft_spread_opts opts, int did_sort); -FINUFFT_EXPORT int FINUFFT_CDECL spreadSorted(BIGINT* sort_indices,BIGINT N1, BIGINT N2, BIGINT N3, - FLT *data_uniform,BIGINT M, FLT *kx, FLT *ky, FLT *kz, - FLT *data_nonuniform, finufft_spread_opts opts, int did_sort); -FINUFFT_EXPORT int FINUFFT_CDECL spreadinterpSorted(BIGINT* sort_indices,BIGINT N1, BIGINT N2, BIGINT N3, - FLT *data_uniform,BIGINT M, FLT *kx, FLT *ky, FLT *kz, - FLT *data_nonuniform, finufft_spread_opts opts, - int did_sort); -FINUFFT_EXPORT FLT FINUFFT_CDECL evaluate_kernel(FLT x,const finufft_spread_opts &opts); -FINUFFT_EXPORT FLT FINUFFT_CDECL evaluate_kernel_noexp(FLT x,const finufft_spread_opts &opts); -FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(finufft_spread_opts &opts,FLT eps,double upsampfac, - int kerevalmeth, int debug, int showwarn, int dim); - - } // namespace -} // namespace - -#endif // SPREADINTERP_H +FINUFFT_EXPORT int FINUFFT_CDECL spreadinterp( + BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, BIGINT M, FLT *kx, FLT *ky, + FLT *kz, FLT *data_nonuniform, finufft_spread_opts opts); +FINUFFT_EXPORT int FINUFFT_CDECL spreadcheck(BIGINT N1, BIGINT N2, BIGINT N3, BIGINT M, + FLT *kx, FLT *ky, FLT *kz, + finufft_spread_opts opts); +FINUFFT_EXPORT int FINUFFT_CDECL indexSort(BIGINT *sort_indices, BIGINT N1, BIGINT N2, + BIGINT N3, BIGINT M, FLT *kx, FLT *ky, FLT *kz, + finufft_spread_opts opts); +FINUFFT_EXPORT int FINUFFT_CDECL interpSorted( + BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, BIGINT M, + FLT *kx, FLT *ky, FLT *kz, FLT *data_nonuniform, finufft_spread_opts opts, + int did_sort); +FINUFFT_EXPORT int FINUFFT_CDECL spreadSorted( + BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, BIGINT M, + FLT *kx, FLT *ky, FLT *kz, FLT *data_nonuniform, finufft_spread_opts opts, + int did_sort); +FINUFFT_EXPORT int FINUFFT_CDECL spreadinterpSorted( + BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, BIGINT M, + FLT *kx, FLT *ky, FLT *kz, FLT *data_nonuniform, finufft_spread_opts opts, + int did_sort); +FINUFFT_EXPORT FLT FINUFFT_CDECL evaluate_kernel(FLT x, const finufft_spread_opts &opts); +FINUFFT_EXPORT FLT FINUFFT_CDECL evaluate_kernel_noexp(FLT x, + const finufft_spread_opts &opts); +FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(finufft_spread_opts &opts, FLT eps, + double upsampfac, int kerevalmeth, + int debug, int showwarn, int dim); + +} // namespace spreadinterp +} // namespace finufft + +#endif // SPREADINTERP_H diff --git a/include/finufft/test_defs.h b/include/finufft/test_defs.h index 54b058266..6142eadfb 100644 --- a/include/finufft/test_defs.h +++ b/include/finufft/test_defs.h @@ -7,7 +7,7 @@ // TESTER SETTINGS... // how big a problem to check direct DFT for in 1D... 
-#define TEST_BIGPROB 1e8 +#define TEST_BIGPROB 1e8 // for omp rand filling #define TEST_RANDCHUNK 1000000 @@ -25,11 +25,11 @@ #include // std stuff for tester src -#include -#include #include -#include #include +#include +#include +#include #include -#endif // TEST_DEFS_H +#endif // TEST_DEFS_H diff --git a/include/finufft/utils.h b/include/finufft/utils.h index 8c2b7619e..9039fee96 100644 --- a/include/finufft/utils.h +++ b/include/finufft/utils.h @@ -7,18 +7,19 @@ #include "finufft/defs.h" namespace finufft { - namespace utils { +namespace utils { // ahb's low-level array helpers -FINUFFT_EXPORT FLT FINUFFT_CDECL relerrtwonorm(BIGINT n, CPX* a, CPX* b); -FINUFFT_EXPORT FLT FINUFFT_CDECL errtwonorm(BIGINT n, CPX* a, CPX* b); -FINUFFT_EXPORT FLT FINUFFT_CDECL twonorm(BIGINT n, CPX* a); -FINUFFT_EXPORT FLT FINUFFT_CDECL infnorm(BIGINT n, CPX* a); -FINUFFT_EXPORT void FINUFFT_CDECL arrayrange(BIGINT n, FLT* a, FLT *lo, FLT *hi); -FINUFFT_EXPORT void FINUFFT_CDECL indexedarrayrange(BIGINT n, BIGINT* i, FLT* a, FLT *lo, FLT *hi); -FINUFFT_EXPORT void FINUFFT_CDECL arraywidcen(BIGINT n, FLT* a, FLT *w, FLT *c); +FINUFFT_EXPORT FLT FINUFFT_CDECL relerrtwonorm(BIGINT n, CPX *a, CPX *b); +FINUFFT_EXPORT FLT FINUFFT_CDECL errtwonorm(BIGINT n, CPX *a, CPX *b); +FINUFFT_EXPORT FLT FINUFFT_CDECL twonorm(BIGINT n, CPX *a); +FINUFFT_EXPORT FLT FINUFFT_CDECL infnorm(BIGINT n, CPX *a); +FINUFFT_EXPORT void FINUFFT_CDECL arrayrange(BIGINT n, FLT *a, FLT *lo, FLT *hi); +FINUFFT_EXPORT void FINUFFT_CDECL indexedarrayrange(BIGINT n, BIGINT *i, FLT *a, FLT *lo, + FLT *hi); +FINUFFT_EXPORT void FINUFFT_CDECL arraywidcen(BIGINT n, FLT *a, FLT *w, FLT *c); - } // namespace -} // namespace - -#endif // UTILS_H +} // namespace utils +} // namespace finufft + +#endif // UTILS_H diff --git a/include/finufft/utils_precindep.h b/include/finufft/utils_precindep.h index 866d33198..0504bb8df 100644 --- a/include/finufft/utils_precindep.h +++ b/include/finufft/utils_precindep.h @@ -10,34 +10,35 @@ #include namespace finufft { - namespace utils { - - FINUFFT_EXPORT BIGINT FINUFFT_CDECL next235even(BIGINT n); - - // jfm's timer class - class FINUFFT_EXPORT CNTime { - public: - void start(); - double restart(); - double elapsedsec(); - private: - double initial; - }; - - // openmp helpers - int get_num_threads_parallel_block(); - - } //namespace -} //namespace - +namespace utils { + +FINUFFT_EXPORT BIGINT FINUFFT_CDECL next235even(BIGINT n); + +// jfm's timer class +class FINUFFT_EXPORT CNTime { +public: + void start(); + double restart(); + double elapsedsec(); + +private: + double initial; +}; + +// openmp helpers +int get_num_threads_parallel_block(); + +} // namespace utils +} // namespace finufft + // thread-safe rand number generator for Windows platform #ifdef _WIN32 #include namespace finufft { - namespace utils { - FINUFFT_EXPORT int FINUFFT_CDECL rand_r(unsigned int *seedp); - } // namespace -} // namespace +namespace utils { +FINUFFT_EXPORT int FINUFFT_CDECL rand_r(unsigned int *seedp); +} // namespace utils +} // namespace finufft #endif -#endif // UTILS_PRECINDEP_H +#endif // UTILS_PRECINDEP_H diff --git a/include/finufft_eitherprec.h b/include/finufft_eitherprec.h index 25703fb1d..250dec7c0 100644 --- a/include/finufft_eitherprec.h +++ b/include/finufft_eitherprec.h @@ -15,26 +15,26 @@ // The 2nd level of indirection is needed for safety, see: // https://isocpp.org/wiki/faq/misc-technical-issues#macros-with-token-pasting #define FINUFFTIFY_UNSAFE(x) finufftf##x -#define FINUFFT_FLT float +#define FINUFFT_FLT 
float #else #define FINUFFTIFY_UNSAFE(x) finufft##x -#define FINUFFT_FLT double +#define FINUFFT_FLT double #endif #define FINUFFTIFY(x) FINUFFTIFY_UNSAFE(x) // decide which kind of complex numbers FINUFFT_CPX is (four options) #ifdef __cplusplus #define _USE_MATH_DEFINES -#include // C++ type +#include // C++ type #define FINUFFT_COMPLEXIFY(X) std::complex #else -#include // C99 type +#include // C99 type #define FINUFFT_COMPLEXIFY(X) X complex #endif -#define FINUFFT_CPX FINUFFT_COMPLEXIFY(FINUFFT_FLT) +#define FINUFFT_CPX FINUFFT_COMPLEXIFY(FINUFFT_FLT) // opaque pointer to finufft_plan private object, for this precision... -#define FINUFFT_PLAN FINUFFTIFY(_plan) +#define FINUFFT_PLAN FINUFFTIFY(_plan) // the plan object pointed to... (doesn't need to be even defined here) #define FINUFFT_PLAN_S FINUFFTIFY(_plan_s) @@ -51,13 +51,13 @@ with it in the future we just need to update cmake for it to work instead of having a check on the msvc version. */ #if defined(FINUFFT_DLL) && (defined(_WIN32) || defined(__WIN32__)) -# if defined(dll_EXPORTS) -# define FINUFFT_EXPORT __declspec(dllexport) -# else -# define FINUFFT_EXPORT __declspec(dllimport) -# endif +#if defined(dll_EXPORTS) +#define FINUFFT_EXPORT __declspec(dllexport) #else -# define FINUFFT_EXPORT +#define FINUFFT_EXPORT __declspec(dllimport) +#endif +#else +#define FINUFFT_EXPORT #endif /* specify calling convention (Windows only) @@ -66,81 +66,115 @@ If the user code changes the default compiler calling convention, may need this when generating DLL. */ #if defined(_WIN32) || defined(__WIN32__) -# define FINUFFT_CDECL __cdecl +#define FINUFFT_CDECL __cdecl #else -# define FINUFFT_CDECL +#define FINUFFT_CDECL #endif //////////////////////////////////////////////////////////////////// // PUBLIC METHOD INTERFACES. All are C-style even when used from C++... #ifdef __cplusplus -extern "C" -{ +extern "C" { #endif -// ----------------- the plan ----------------------------------------------- +// ----------------- the plan ----------------------------------------------- // the plan handle that we pass around is just a pointer to the plan object // that contains all the info. The latter is invisible to the public user. 
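For orientation, a minimal sketch of driving the guru entry points declared just below, written for the double-precision build where FINUFFTIFY(x) expands to finufft##x (so the names become finufft_makeplan, finufft_setpts, finufft_execute, finufft_destroy). The helper name type1_demo, the tolerance, and the sign flag are illustrative; FINUFFT_BIGINT is taken to be int64_t (its default), and the public finufft.h header is assumed to pull in these declarations.

#include <complex>
#include <cstdint>
#include <vector>
#include <finufft.h>
// 1D type 1: M strengths c at points x -> N1 Fourier mode coefficients F
static int type1_demo(int64_t M, double *x, std::complex<double> *c, int64_t N1) {
  finufft_opts opts;
  finufft_default_opts(&opts);
  finufft_plan plan;
  int64_t n_modes[3] = {N1, 1, 1};             // only n_modes[0] used in 1D
  int ier = finufft_makeplan(1, 1, n_modes, +1, 1, 1e-9, &plan, &opts);
  if (ier) return ier;
  finufft_setpts(plan, M, x, nullptr, nullptr, 0, nullptr, nullptr, nullptr);
  std::vector<std::complex<double>> F(N1);
  ier = finufft_execute(plan, c, F.data());    // weights c in, modes F out
  finufft_destroy(plan);
  return ier;
}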
-typedef struct FINUFFT_PLAN_S * FINUFFT_PLAN; +typedef struct FINUFFT_PLAN_S *FINUFFT_PLAN; - // ------------------ the guru interface ------------------------------------ // (sources in finufft.cpp) - - FINUFFT_EXPORT void FINUFFT_CDECL FINUFFTIFY(_default_opts)(finufft_opts *o); - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_makeplan)(int type, int dim, FINUFFT_BIGINT* n_modes, int iflag, int n_transf, FINUFFT_FLT tol, FINUFFT_PLAN* plan, finufft_opts* o); - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_setpts)(FINUFFT_PLAN plan , FINUFFT_BIGINT M, FINUFFT_FLT *xj, FINUFFT_FLT *yj, FINUFFT_FLT *zj, FINUFFT_BIGINT N, FINUFFT_FLT *s, FINUFFT_FLT *t, FINUFFT_FLT *u); - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_execute)(FINUFFT_PLAN plan, FINUFFT_CPX* weights, FINUFFT_CPX* result); - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_destroy)(FINUFFT_PLAN plan); +FINUFFT_EXPORT void FINUFFT_CDECL FINUFFTIFY(_default_opts)(finufft_opts *o); +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_makeplan)( + int type, int dim, FINUFFT_BIGINT *n_modes, int iflag, int n_transf, FINUFFT_FLT tol, + FINUFFT_PLAN *plan, finufft_opts *o); +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_setpts)( + FINUFFT_PLAN plan, FINUFFT_BIGINT M, FINUFFT_FLT *xj, FINUFFT_FLT *yj, + FINUFFT_FLT *zj, FINUFFT_BIGINT N, FINUFFT_FLT *s, FINUFFT_FLT *t, FINUFFT_FLT *u); +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_execute)( + FINUFFT_PLAN plan, FINUFFT_CPX *weights, FINUFFT_CPX *result); +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_destroy)(FINUFFT_PLAN plan); // ----------------- the 18 simple interfaces ------------------------------- // (sources in simpleinterfaces.cpp) - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d1)(FINUFFT_BIGINT nj,FINUFFT_FLT* xj,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps,FINUFFT_BIGINT ms, - FINUFFT_CPX* fk, finufft_opts *opts); - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d1many)(int ntransf, FINUFFT_BIGINT nj,FINUFFT_FLT* xj,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps,FINUFFT_BIGINT ms, - FINUFFT_CPX* fk, finufft_opts *opts); - - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d2)(FINUFFT_BIGINT nj,FINUFFT_FLT* xj,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps,FINUFFT_BIGINT ms, - FINUFFT_CPX* fk, finufft_opts *opts); - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d2many)(int ntransf, FINUFFT_BIGINT nj,FINUFFT_FLT* xj,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps,FINUFFT_BIGINT ms, - FINUFFT_CPX* fk, finufft_opts *opts); - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d3)(FINUFFT_BIGINT nj,FINUFFT_FLT* x,FINUFFT_CPX* c,int iflag,FINUFFT_FLT eps,FINUFFT_BIGINT nk, FINUFFT_FLT* s, FINUFFT_CPX* f, finufft_opts *opts); - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d3many)(int ntransf, FINUFFT_BIGINT nj,FINUFFT_FLT* x,FINUFFT_CPX* c,int iflag,FINUFFT_FLT eps,FINUFFT_BIGINT nk, FINUFFT_FLT* s, FINUFFT_CPX* f, finufft_opts *opts); - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(2d1)(FINUFFT_BIGINT nj,FINUFFT_FLT* xj,FINUFFT_FLT *yj,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps, - FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_CPX* fk, finufft_opts *opts); - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(2d1many)(int ndata, FINUFFT_BIGINT nj, FINUFFT_FLT* xj, FINUFFT_FLT *yj, FINUFFT_CPX* c, int iflag, - FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_CPX* fk, finufft_opts *opts); - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(2d2)(FINUFFT_BIGINT nj,FINUFFT_FLT* xj,FINUFFT_FLT *yj,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps, - FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_CPX* fk, finufft_opts *opts); - FINUFFT_EXPORT 
int FINUFFT_CDECL FINUFFTIFY(2d2many)(int ndata, FINUFFT_BIGINT nj, FINUFFT_FLT* xj, FINUFFT_FLT *yj, FINUFFT_CPX* c, int iflag, - FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_CPX* fk, finufft_opts *opts); - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(2d3)(FINUFFT_BIGINT nj,FINUFFT_FLT* x,FINUFFT_FLT *y,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps,FINUFFT_BIGINT nk, FINUFFT_FLT* s, FINUFFT_FLT* t, FINUFFT_CPX* fk, finufft_opts *opts); - - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(2d3many)(int ntransf, FINUFFT_BIGINT nj,FINUFFT_FLT* x,FINUFFT_FLT *y,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps,FINUFFT_BIGINT nk, FINUFFT_FLT* s, FINUFFT_FLT* t, FINUFFT_CPX* fk, finufft_opts *opts); - - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d1)(FINUFFT_BIGINT nj,FINUFFT_FLT* xj,FINUFFT_FLT *yj,FINUFFT_FLT *zj,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps, - FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_BIGINT mu, FINUFFT_CPX* fk, finufft_opts *opts); - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d1many)(int ntransfs, FINUFFT_BIGINT nj,FINUFFT_FLT* xj,FINUFFT_FLT *yj,FINUFFT_FLT *zj,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps, - FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_BIGINT mu, FINUFFT_CPX* fk, finufft_opts *opts); - - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d2)(FINUFFT_BIGINT nj,FINUFFT_FLT* xj,FINUFFT_FLT *yj,FINUFFT_FLT *zj,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps, - FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_BIGINT mu, FINUFFT_CPX* fk, finufft_opts *opts); - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d2many)(int ntransf, FINUFFT_BIGINT nj,FINUFFT_FLT* xj,FINUFFT_FLT *yj,FINUFFT_FLT *zj,FINUFFT_CPX* cj,int iflag,FINUFFT_FLT eps, - FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_BIGINT mu, FINUFFT_CPX* fk, finufft_opts *opts); - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d3)(FINUFFT_BIGINT nj,FINUFFT_FLT* x,FINUFFT_FLT *y,FINUFFT_FLT *z, FINUFFT_CPX* cj,int iflag, - FINUFFT_FLT eps,FINUFFT_BIGINT nk,FINUFFT_FLT* s, FINUFFT_FLT* t, FINUFFT_FLT *u, - FINUFFT_CPX* fk, finufft_opts *opts); - FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d3many)(int ntransf, FINUFFT_BIGINT nj,FINUFFT_FLT* x,FINUFFT_FLT *y,FINUFFT_FLT *z, FINUFFT_CPX* cj,int iflag, - FINUFFT_FLT eps,FINUFFT_BIGINT nk,FINUFFT_FLT* s, FINUFFT_FLT* t, FINUFFT_FLT *u, - FINUFFT_CPX* fk, finufft_opts *opts); - +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d1)( + FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_CPX *cj, int iflag, FINUFFT_FLT eps, + FINUFFT_BIGINT ms, FINUFFT_CPX *fk, finufft_opts *opts); +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d1many)( + int ntransf, FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_CPX *cj, int iflag, + FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_CPX *fk, finufft_opts *opts); + +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d2)( + FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_CPX *cj, int iflag, FINUFFT_FLT eps, + FINUFFT_BIGINT ms, FINUFFT_CPX *fk, finufft_opts *opts); +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d2many)( + int ntransf, FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_CPX *cj, int iflag, + FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_CPX *fk, finufft_opts *opts); +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d3)( + FINUFFT_BIGINT nj, FINUFFT_FLT *x, FINUFFT_CPX *c, int iflag, FINUFFT_FLT eps, + FINUFFT_BIGINT nk, FINUFFT_FLT *s, FINUFFT_CPX *f, finufft_opts *opts); +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d3many)( + int ntransf, FINUFFT_BIGINT nj, FINUFFT_FLT *x, FINUFFT_CPX *c, int iflag, + FINUFFT_FLT eps, FINUFFT_BIGINT nk, FINUFFT_FLT *s, FINUFFT_CPX *f, + finufft_opts 
*opts); +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(2d1)( + FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_FLT *yj, FINUFFT_CPX *cj, int iflag, + FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_CPX *fk, + finufft_opts *opts); +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(2d1many)( + int ndata, FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_FLT *yj, FINUFFT_CPX *c, + int iflag, FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_CPX *fk, + finufft_opts *opts); +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(2d2)( + FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_FLT *yj, FINUFFT_CPX *cj, int iflag, + FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_CPX *fk, + finufft_opts *opts); +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(2d2many)( + int ndata, FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_FLT *yj, FINUFFT_CPX *c, + int iflag, FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_CPX *fk, + finufft_opts *opts); +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(2d3)( + FINUFFT_BIGINT nj, FINUFFT_FLT *x, FINUFFT_FLT *y, FINUFFT_CPX *cj, int iflag, + FINUFFT_FLT eps, FINUFFT_BIGINT nk, FINUFFT_FLT *s, FINUFFT_FLT *t, FINUFFT_CPX *fk, + finufft_opts *opts); + +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(2d3many)( + int ntransf, FINUFFT_BIGINT nj, FINUFFT_FLT *x, FINUFFT_FLT *y, FINUFFT_CPX *cj, + int iflag, FINUFFT_FLT eps, FINUFFT_BIGINT nk, FINUFFT_FLT *s, FINUFFT_FLT *t, + FINUFFT_CPX *fk, finufft_opts *opts); + +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d1)( + FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_FLT *yj, FINUFFT_FLT *zj, FINUFFT_CPX *cj, + int iflag, FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_BIGINT mu, + FINUFFT_CPX *fk, finufft_opts *opts); +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d1many)( + int ntransfs, FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_FLT *yj, FINUFFT_FLT *zj, + FINUFFT_CPX *cj, int iflag, FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, + FINUFFT_BIGINT mu, FINUFFT_CPX *fk, finufft_opts *opts); + +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d2)( + FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_FLT *yj, FINUFFT_FLT *zj, FINUFFT_CPX *cj, + int iflag, FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, FINUFFT_BIGINT mu, + FINUFFT_CPX *fk, finufft_opts *opts); +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d2many)( + int ntransf, FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_FLT *yj, FINUFFT_FLT *zj, + FINUFFT_CPX *cj, int iflag, FINUFFT_FLT eps, FINUFFT_BIGINT ms, FINUFFT_BIGINT mt, + FINUFFT_BIGINT mu, FINUFFT_CPX *fk, finufft_opts *opts); +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d3)( + FINUFFT_BIGINT nj, FINUFFT_FLT *x, FINUFFT_FLT *y, FINUFFT_FLT *z, FINUFFT_CPX *cj, + int iflag, FINUFFT_FLT eps, FINUFFT_BIGINT nk, FINUFFT_FLT *s, FINUFFT_FLT *t, + FINUFFT_FLT *u, FINUFFT_CPX *fk, finufft_opts *opts); +FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(3d3many)( + int ntransf, FINUFFT_BIGINT nj, FINUFFT_FLT *x, FINUFFT_FLT *y, FINUFFT_FLT *z, + FINUFFT_CPX *cj, int iflag, FINUFFT_FLT eps, FINUFFT_BIGINT nk, FINUFFT_FLT *s, + FINUFFT_FLT *t, FINUFFT_FLT *u, FINUFFT_CPX *fk, finufft_opts *opts); + #ifdef __cplusplus } #endif - // clean up things that were purely local to this file #undef FINUFFT_COMPLEXIFY #undef FINUFFTIFY_UNSAFE diff --git a/include/finufft_opts.h b/include/finufft_opts.h index 4f6db1e02..0435b8c41 100644 --- a/include/finufft_opts.h +++ b/include/finufft_opts.h @@ -5,19 +5,18 @@ #ifndef FINUFFT_OPTS_H #define FINUFFT_OPTS_H - -typedef struct finufft_opts{ // defaults see 
finufft.cpp:finufft_default_opts() +typedef struct finufft_opts { // defaults see finufft.cpp:finufft_default_opts() // sphinx tag (don't remove): @opts_start // FINUFFT options: // data handling opts... - int modeord; // (type 1,2 only): 0 CMCL-style increasing mode order - // 1 FFT-style mode order - int chkbnds; // [DEPRECATED] 0 don't check NU pts in [-3pi,3pi), 1 do (1 only): 0 auto, 1 seq multithreaded, // 2 parallel single-thread spread int maxbatchsize; // (vectorized ntr>1 only): max transform batch, 0 auto - int spread_nthr_atomic; // if >=0, threads above which spreader OMP critical goes atomic + int spread_nthr_atomic; // if >=0, threads above which spreader OMP critical goes + // atomic int spread_max_sp_size; // if >0, overrides spreader (dir=1) max subproblem size - // sphinx tag (don't remove): @opts_end + // sphinx tag (don't remove): @opts_end } finufft_opts; // Those of the above of the form spread_* indicate pass through to finufft_spread_opts // define deprecated opts macro #if defined(__cplusplus) && (__cplusplus >= 201402L) -#define DEPRECATED_OPTS [[deprecated ("as of v2.1.0, nufft_opts is obsolete and renamed finufft_opts; please use this instead.")]] +#define DEPRECATED_OPTS \ + [[deprecated("as of v2.1.0, nufft_opts is obsolete and renamed finufft_opts; " \ + "please use this instead.")]] #elif defined(_MSC_VER) -#define DEPRECATED_OPTS __declspec(deprecated("as of v2.1.0, nufft_opts is obsolete and renamed finufft_opts; please use this instead.")) +#define DEPRECATED_OPTS \ + __declspec(deprecated("as of v2.1.0, nufft_opts is obsolete and renamed " \ + "finufft_opts; please use this instead.")) #else -#define DEPRECATED_OPTS __attribute__((deprecated("as of v2.1.0, nufft_opts is obsolete and renamed finufft_opts; please use this instead."))) +#define DEPRECATED_OPTS \ + __attribute__((deprecated("as of v2.1.0, nufft_opts is obsolete and renamed " \ + "finufft_opts; please use this instead."))) #endif // Backwards-compatibility DEPRECATED_OPTS typedef finufft_opts nufft_opts; -#endif // FINUFFT_OPTS_H +#endif // FINUFFT_OPTS_H diff --git a/include/finufft_spread_opts.h b/include/finufft_spread_opts.h index 8549505db..2f3c9ce76 100644 --- a/include/finufft_spread_opts.h +++ b/include/finufft_spread_opts.h @@ -10,25 +10,25 @@ typedef struct finufft_spread_opts { // See spreadinterp:setup_spreader for default values of the following fields. // This is the main documentation for these options... - int nspread; // w, the kernel width in grid pts - int spread_direction; // 1 means spread NU->U, 2 means interpolate U->NU - int chkbnds; // [DEPRECATED] 0: don't check NU pts in 3-period range; 1: do - int sort; // 0: don't sort NU pts, 1: do, 2: heuristic choice - int kerevalmeth; // 0: direct exp(sqrt()), or 1: Horner ppval, fastest - int kerpad; // 0: no pad w to mult of 4, 1: do pad - // (this helps SIMD for kerevalmeth=0, eg on i7). - int nthreads; // # threads for spreadinterp (0: use max avail) - int sort_threads; // # threads for sort (0: auto-choice up to nthreads) + int nspread; // w, the kernel width in grid pts + int spread_direction; // 1 means spread NU->U, 2 means interpolate U->NU + int chkbnds; // [DEPRECATED] 0: don't check NU pts in 3-period range; 1: do + int sort; // 0: don't sort NU pts, 1: do, 2: heuristic choice + int kerevalmeth; // 0: direct exp(sqrt()), or 1: Horner ppval, fastest + int kerpad; // 0: no pad w to mult of 4, 1: do pad + // (this helps SIMD for kerevalmeth=0, eg on i7). 
+ int nthreads; // # threads for spreadinterp (0: use max avail) + int sort_threads; // # threads for sort (0: auto-choice up to nthreads) int max_subproblem_size; // # pts per t1 subprob; sets extra RAM per thread - int flags; // binary flags for timing only (may give wrong ans - // if changed from 0!). See spreadinterp.h - int debug; // 0: silent, 1: small text output, 2: verbose - int atomic_threshold; // num threads before switching spreadSorted to using atomic ops - double upsampfac; // sigma, upsampling factor + int flags; // binary flags for timing only (may give wrong ans + // if changed from 0!). See spreadinterp.h + int debug; // 0: silent, 1: small text output, 2: verbose + int atomic_threshold; // num threads before switching spreadSorted to using atomic ops + double upsampfac; // sigma, upsampling factor // ES kernel specific consts for eval. No longer FLT, to avoid name clash... double ES_beta; double ES_halfwidth; double ES_c; } finufft_spread_opts; -#endif // FINUFFT_SPREAD_OPTS_H +#endif // FINUFFT_SPREAD_OPTS_H diff --git a/matlab/finufft.cpp b/matlab/finufft.cpp index 9a805dade..ccbc9a59a 100644 --- a/matlab/finufft.cpp +++ b/matlab/finufft.cpp @@ -31,9 +31,9 @@ THE SOFTWARE. */ +#include #include #include -#include #include @@ -41,12 +41,10 @@ #include #endif - /* * Records for call profile. */ -int* mexprofrecord_= NULL; - +int *mexprofrecord_ = NULL; /* * Support routines for copying data into and out of the MEX stubs, R2018a @@ -54,502 +52,421 @@ int* mexprofrecord_= NULL; #if MX_HAS_INTERLEAVED_COMPLEX -void* mxWrapGetP(const mxArray* a, const char* fmt, const char** e) -{ - void* p = NULL; +void *mxWrapGetP(const mxArray *a, const char *fmt, const char **e) { + void *p = NULL; #ifdef R2008OO - mxArray* ap; + mxArray *ap; #endif - if (mxGetClassID(a) == mxDOUBLE_CLASS && mxIsComplex(a) ) - { - if( mxGetM(a)*mxGetN(a) == 1 && (*mxGetComplexDoubles(a)).real == 0 ) - return NULL; - } - if (mxGetClassID(a) == mxDOUBLE_CLASS && !mxIsComplex(a) ) - { - if( mxGetM(a)*mxGetN(a) == 1 && *mxGetDoubles(a) == 0) - return NULL; - } - if (mxIsChar(a)) { - char pbuf[128]; - mxGetString(a, pbuf, sizeof(pbuf)); - sscanf(pbuf, fmt, &p); - } + if (mxGetClassID(a) == mxDOUBLE_CLASS && mxIsComplex(a)) { + if (mxGetM(a) * mxGetN(a) == 1 && (*mxGetComplexDoubles(a)).real == 0) return NULL; + } + if (mxGetClassID(a) == mxDOUBLE_CLASS && !mxIsComplex(a)) { + if (mxGetM(a) * mxGetN(a) == 1 && *mxGetDoubles(a) == 0) return NULL; + } + if (mxIsChar(a)) { + char pbuf[128]; + mxGetString(a, pbuf, sizeof(pbuf)); + sscanf(pbuf, fmt, &p); + } #ifdef R2008OO - else if (ap = mxGetProperty(a, 0, "mwptr")) { - return mxWrapGetP(ap, fmt, e); - } + else if (ap = mxGetProperty(a, 0, "mwptr")) { + return mxWrapGetP(ap, fmt, e); + } #endif - if (p == 0) - *e = "Invalid pointer"; - return p; -} - -mxArray* mxWrapCreateP(void* p, const char* fmt) -{ - if (p == 0) { - mxArray* z = mxCreateDoubleMatrix(1,1, mxREAL); - *mxGetDoubles(z) = 0; - return z; - } else { - char pbuf[128]; - sprintf(pbuf, fmt, p); - return mxCreateString(pbuf); - } -} - -mxArray* mxWrapStrncpy(const char* s) -{ - if (s) { - return mxCreateString(s); - } else { - mxArray* z = mxCreateDoubleMatrix(1,1, mxREAL); - *mxGetDoubles(z) = 0; - return z; - } -} - -char* mxWrapGetString(const mxArray* a, const char** e) -{ - char* s; - mwSize slen; - if (!a || (!mxIsChar(a) && mxGetM(a)*mxGetN(a) > 0)) { - *e = "Invalid string argument"; - return NULL; - } - slen = mxGetM(a)*mxGetN(a) + 1; - s = (char*) mxMalloc(slen); - if (mxGetM(a)*mxGetN(a) == 
0) - *s = 0; - else - mxGetString(a, s, slen); - return s; -} - - -double mxWrapGetScalar(const mxArray* a, const char** e) -{ - if (!a || mxGetClassID(a) != mxDOUBLE_CLASS || mxGetM(a)*mxGetN(a) != 1) { - *e = "Invalid scalar argument"; - return 0; - } - if( mxIsComplex(a) ) - return (double) (*mxGetComplexDoubles(a)).real; - else - return (double) (*mxGetDoubles(a)); -} - -#define mxWrapGetArrayDef(func, T) \ -T* func(const mxArray* a, const char** e) \ -{ \ - T* array; \ - mwSize arraylen; \ - mwIndex i; \ - T* p; \ - double* q; \ - mxComplexDouble* z; \ - if (!a || mxGetClassID(a) != mxDOUBLE_CLASS) { \ - *e = "Invalid array argument, mxDOUBLE_CLASS expected"; \ - return 0; \ - } \ - arraylen = mxGetM(a)*mxGetN(a); \ - array = (T*) mxMalloc(mxGetM(a)*mxGetN(a) * sizeof(T)); \ - p = array; \ - if( mxIsComplex(a) ) \ - { \ - z = mxGetComplexDoubles(a); \ - for (i = 0; i < arraylen; ++i) \ - *p++ = (T) (*z++).real; \ - } \ - else \ - { \ - q = mxGetDoubles(a); \ - for (i = 0; i < arraylen; ++i) \ - *p++ = (T) (*q++); \ - } \ - return array; \ -} - - -#define mxWrapCopyDef(func, T) \ -void func(mxArray* a, const T* q, mwSize n) \ -{ \ - mwIndex i; \ - double* p; \ - mxComplexDouble* z; \ - if( mxIsComplex(a) ) \ - { \ - z = mxGetComplexDoubles(a); \ - for (i = 0; i < n; ++i) \ - (*z++).real = (double) *q++; \ - (*z++).imag = 0; \ - } \ - else \ - { \ - p = mxGetDoubles(a); \ - for (i = 0; i < n; ++i) \ - *p++ = (double) *q++; \ - } \ -} - - -#define mxWrapReturnDef(func, T) \ -mxArray* func(const T* q, mwSize m, mwSize n) \ -{ \ - mwIndex i; \ - double* p; \ - if (!q) { \ - return mxCreateDoubleMatrix(0,0, mxREAL); \ - } else { \ - mxArray* a = mxCreateDoubleMatrix(m,n, mxREAL); \ - p = mxGetDoubles(a); \ - for (i = 0; i < m*n; ++i) \ - *p++ = (double) *q++; \ - return a; \ - } \ -} - - -#define mxWrapGetScalarZDef(func, T, ZT, setz) \ -void func(T* z, const mxArray* a) \ -{ \ - if( mxIsComplex(a) ) \ - { \ - setz(z, (ZT) (*mxGetComplexDoubles(a)).real, (ZT) (*mxGetComplexDoubles(a)).imag); \ - } \ - else \ - { \ - setz(z, (ZT) (*mxGetComplexDoubles(a)).real, (ZT) 0); \ - } \ -} - - -#define mxWrapGetArrayZDef(func, T, ZT, setz) \ -T* func(const mxArray* a, const char** e) \ -{ \ - T* array; \ - mwSize arraylen; \ - mwIndex i; \ - T* p; \ - double* q; \ - mxComplexDouble* z; \ - if (!a || mxGetClassID(a) != mxDOUBLE_CLASS) { \ - *e = "Invalid array argument, mxDOUBLE_CLASS expected"; \ - return 0; \ - } \ - arraylen = mxGetM(a)*mxGetN(a); \ - array = (T*) mxMalloc(mxGetM(a)*mxGetN(a) * sizeof(T)); \ - p = array; \ - if( mxIsComplex(a) ) \ - { \ - z = mxGetComplexDoubles(a); \ - for (i = 0; i < arraylen; ++i) { \ - setz(p, (ZT) (*z).real, (ZT) (*z).imag); \ - ++p; ++z; } \ - } \ - else \ - { \ - q = mxGetDoubles(a); \ - for (i = 0; i < arraylen; ++i) { \ - setz(p, (ZT) (*q), (ZT) 0 ); \ - ++p; ++q; } \ - } \ - return array; \ -} - - -#define mxWrapCopyZDef(func, T, freal, fimag) \ -void func(mxArray* a, const T* q, mwSize n) \ -{ \ - mwIndex i; \ - double* p; \ - mxComplexDouble* z; \ - if( mxIsComplex(a) ) \ - { \ - z = mxGetComplexDoubles(a); \ - for (i = 0; i < n; ++i) { \ - (*z).real = freal(*q); \ - (*z).imag = fimag(*q); \ - ++z; ++q; } \ - } \ - else \ - { \ - p = mxGetDoubles(a); \ - for (i = 0; i < n; ++i) \ - *p++ = freal(*q++); \ - } \ -} - - -#define mxWrapReturnZDef(func, T, freal, fimag) \ -mxArray* func(const T* q, mwSize m, mwSize n) \ -{ \ - mwIndex i; \ - mxComplexDouble* p; \ - if (!q) { \ - return mxCreateDoubleMatrix(0,0, mxCOMPLEX); \ - } else { \ - mxArray* a = 
mxCreateDoubleMatrix(m,n, mxCOMPLEX); \ - p = mxGetComplexDoubles(a); \ - for (i = 0; i < m*n; ++i) { \ - (*p).real = freal(*q); \ - (*p).imag = fimag(*q); \ - ++p; ++q; } \ - return a; \ - } \ -} - - - - - - -void* mxWrapGetP_single(const mxArray* a, const char* fmt, const char** e) -{ - void* p = NULL; + if (p == 0) *e = "Invalid pointer"; + return p; +} + +mxArray *mxWrapCreateP(void *p, const char *fmt) { + if (p == 0) { + mxArray *z = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetDoubles(z) = 0; + return z; + } else { + char pbuf[128]; + sprintf(pbuf, fmt, p); + return mxCreateString(pbuf); + } +} + +mxArray *mxWrapStrncpy(const char *s) { + if (s) { + return mxCreateString(s); + } else { + mxArray *z = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetDoubles(z) = 0; + return z; + } +} + +char *mxWrapGetString(const mxArray *a, const char **e) { + char *s; + mwSize slen; + if (!a || (!mxIsChar(a) && mxGetM(a) * mxGetN(a) > 0)) { + *e = "Invalid string argument"; + return NULL; + } + slen = mxGetM(a) * mxGetN(a) + 1; + s = (char *)mxMalloc(slen); + if (mxGetM(a) * mxGetN(a) == 0) + *s = 0; + else + mxGetString(a, s, slen); + return s; +} + +double mxWrapGetScalar(const mxArray *a, const char **e) { + if (!a || mxGetClassID(a) != mxDOUBLE_CLASS || mxGetM(a) * mxGetN(a) != 1) { + *e = "Invalid scalar argument"; + return 0; + } + if (mxIsComplex(a)) + return (double)(*mxGetComplexDoubles(a)).real; + else + return (double)(*mxGetDoubles(a)); +} + +#define mxWrapGetArrayDef(func, T) \ + T *func(const mxArray *a, const char **e) { \ + T *array; \ + mwSize arraylen; \ + mwIndex i; \ + T *p; \ + double *q; \ + mxComplexDouble *z; \ + if (!a || mxGetClassID(a) != mxDOUBLE_CLASS) { \ + *e = "Invalid array argument, mxDOUBLE_CLASS expected"; \ + return 0; \ + } \ + arraylen = mxGetM(a) * mxGetN(a); \ + array = (T *)mxMalloc(mxGetM(a) * mxGetN(a) * sizeof(T)); \ + p = array; \ + if (mxIsComplex(a)) { \ + z = mxGetComplexDoubles(a); \ + for (i = 0; i < arraylen; ++i) *p++ = (T)(*z++).real; \ + } else { \ + q = mxGetDoubles(a); \ + for (i = 0; i < arraylen; ++i) *p++ = (T)(*q++); \ + } \ + return array; \ + } + +#define mxWrapCopyDef(func, T) \ + void func(mxArray *a, const T *q, mwSize n) { \ + mwIndex i; \ + double *p; \ + mxComplexDouble *z; \ + if (mxIsComplex(a)) { \ + z = mxGetComplexDoubles(a); \ + for (i = 0; i < n; ++i) (*z++).real = (double)*q++; \ + (*z++).imag = 0; \ + } else { \ + p = mxGetDoubles(a); \ + for (i = 0; i < n; ++i) *p++ = (double)*q++; \ + } \ + } + +#define mxWrapReturnDef(func, T) \ + mxArray *func(const T *q, mwSize m, mwSize n) { \ + mwIndex i; \ + double *p; \ + if (!q) { \ + return mxCreateDoubleMatrix(0, 0, mxREAL); \ + } else { \ + mxArray *a = mxCreateDoubleMatrix(m, n, mxREAL); \ + p = mxGetDoubles(a); \ + for (i = 0; i < m * n; ++i) *p++ = (double)*q++; \ + return a; \ + } \ + } + +#define mxWrapGetScalarZDef(func, T, ZT, setz) \ + void func(T *z, const mxArray *a) { \ + if (mxIsComplex(a)) { \ + setz(z, (ZT)(*mxGetComplexDoubles(a)).real, (ZT)(*mxGetComplexDoubles(a)).imag); \ + } else { \ + setz(z, (ZT)(*mxGetComplexDoubles(a)).real, (ZT)0); \ + } \ + } + +#define mxWrapGetArrayZDef(func, T, ZT, setz) \ + T *func(const mxArray *a, const char **e) { \ + T *array; \ + mwSize arraylen; \ + mwIndex i; \ + T *p; \ + double *q; \ + mxComplexDouble *z; \ + if (!a || mxGetClassID(a) != mxDOUBLE_CLASS) { \ + *e = "Invalid array argument, mxDOUBLE_CLASS expected"; \ + return 0; \ + } \ + arraylen = mxGetM(a) * mxGetN(a); \ + array = (T *)mxMalloc(mxGetM(a) * mxGetN(a) * 
sizeof(T)); \ + p = array; \ + if (mxIsComplex(a)) { \ + z = mxGetComplexDoubles(a); \ + for (i = 0; i < arraylen; ++i) { \ + setz(p, (ZT)(*z).real, (ZT)(*z).imag); \ + ++p; \ + ++z; \ + } \ + } else { \ + q = mxGetDoubles(a); \ + for (i = 0; i < arraylen; ++i) { \ + setz(p, (ZT)(*q), (ZT)0); \ + ++p; \ + ++q; \ + } \ + } \ + return array; \ + } + +#define mxWrapCopyZDef(func, T, freal, fimag) \ + void func(mxArray *a, const T *q, mwSize n) { \ + mwIndex i; \ + double *p; \ + mxComplexDouble *z; \ + if (mxIsComplex(a)) { \ + z = mxGetComplexDoubles(a); \ + for (i = 0; i < n; ++i) { \ + (*z).real = freal(*q); \ + (*z).imag = fimag(*q); \ + ++z; \ + ++q; \ + } \ + } else { \ + p = mxGetDoubles(a); \ + for (i = 0; i < n; ++i) *p++ = freal(*q++); \ + } \ + } + +#define mxWrapReturnZDef(func, T, freal, fimag) \ + mxArray *func(const T *q, mwSize m, mwSize n) { \ + mwIndex i; \ + mxComplexDouble *p; \ + if (!q) { \ + return mxCreateDoubleMatrix(0, 0, mxCOMPLEX); \ + } else { \ + mxArray *a = mxCreateDoubleMatrix(m, n, mxCOMPLEX); \ + p = mxGetComplexDoubles(a); \ + for (i = 0; i < m * n; ++i) { \ + (*p).real = freal(*q); \ + (*p).imag = fimag(*q); \ + ++p; \ + ++q; \ + } \ + return a; \ + } \ + } + +void *mxWrapGetP_single(const mxArray *a, const char *fmt, const char **e) { + void *p = NULL; #ifdef R2008OO - mxArray* ap; + mxArray *ap; #endif - if (mxGetClassID(a) == mxSINGLE_CLASS && mxIsComplex(a) ) - { - if( mxGetM(a)*mxGetN(a) == 1 && (*mxGetComplexSingles(a)).real == 0 ) - return NULL; - } - if (mxGetClassID(a) == mxSINGLE_CLASS && !mxIsComplex(a) ) - { - if( mxGetM(a)*mxGetN(a) == 1 && *mxGetSingles(a) == 0) - return NULL; - } - if (mxIsChar(a)) { - char pbuf[128]; - mxGetString(a, pbuf, sizeof(pbuf)); - sscanf(pbuf, fmt, &p); - } + if (mxGetClassID(a) == mxSINGLE_CLASS && mxIsComplex(a)) { + if (mxGetM(a) * mxGetN(a) == 1 && (*mxGetComplexSingles(a)).real == 0) return NULL; + } + if (mxGetClassID(a) == mxSINGLE_CLASS && !mxIsComplex(a)) { + if (mxGetM(a) * mxGetN(a) == 1 && *mxGetSingles(a) == 0) return NULL; + } + if (mxIsChar(a)) { + char pbuf[128]; + mxGetString(a, pbuf, sizeof(pbuf)); + sscanf(pbuf, fmt, &p); + } #ifdef R2008OO - else if (ap = mxGetProperty(a, 0, "mwptr")) { - return mxWrapGetP(ap, fmt, e); - } + else if (ap = mxGetProperty(a, 0, "mwptr")) { + return mxWrapGetP(ap, fmt, e); + } #endif - if (p == 0) - *e = "Invalid pointer"; - return p; -} - -mxArray* mxWrapCreateP_single(void* p, const char* fmt) -{ - if (p == 0) { - mxArray* z = mxCreateNumericMatrix(1,1, mxSINGLE_CLASS, mxREAL); - *mxGetSingles(z) = 0; - return z; - } else { - char pbuf[128]; - sprintf(pbuf, fmt, p); - return mxCreateString(pbuf); - } -} - -mxArray* mxWrapStrncpy_single(const char* s) -{ - if (s) { - return mxCreateString(s); - } else { - mxArray* z = mxCreateNumericMatrix(1,1, mxSINGLE_CLASS, mxREAL); - *mxGetSingles(z) = 0; - return z; - } -} - -char* mxWrapGetString_single(const mxArray* a, const char** e) -{ - char* s; - mwSize slen; - if (!a || (!mxIsChar(a) && mxGetM(a)*mxGetN(a) > 0)) { - *e = "Invalid string argument"; - return NULL; - } - slen = mxGetM(a)*mxGetN(a) + 1; - s = (char*) mxMalloc(slen); - if (mxGetM(a)*mxGetN(a) == 0) - *s = 0; - else - mxGetString(a, s, slen); - return s; -} - - -float mxWrapGetScalar_single(const mxArray* a, const char** e) -{ - if (!a || mxGetClassID(a) != mxSINGLE_CLASS || mxGetM(a)*mxGetN(a) != 1) { - *e = "Invalid scalar argument"; - return 0; - } - if( mxIsComplex(a) ) - return (float) (*mxGetComplexSingles(a)).real; - else - return (float) 
(*mxGetSingles(a)); -} - -#define mxWrapGetArrayDef_single(func, T) \ -T* func(const mxArray* a, const char** e) \ -{ \ - T* array; \ - mwSize arraylen; \ - mwIndex i; \ - T* p; \ - float* q; \ - mxComplexSingle* z; \ - if (!a || mxGetClassID(a) != mxSINGLE_CLASS) { \ - *e = "Invalid array argument, mxSINGLE_CLASS expected"; \ - return 0; \ - } \ - arraylen = mxGetM(a)*mxGetN(a); \ - array = (T*) mxMalloc(mxGetM(a)*mxGetN(a) * sizeof(T)); \ - p = array; \ - if( mxIsComplex(a) ) \ - { \ - z = mxGetComplexSingles(a); \ - for (i = 0; i < arraylen; ++i) \ - *p++ = (T) (*z++).real; \ - } \ - else \ - { \ - q = mxGetSingles(a); \ - for (i = 0; i < arraylen; ++i) \ - *p++ = (T) (*q++); \ - } \ - return array; \ -} - - -#define mxWrapCopyDef_single(func, T) \ -void func(mxArray* a, const T* q, mwSize n) \ -{ \ - mwIndex i; \ - float* p; \ - mxComplexSingle* z; \ - if( mxIsComplex(a) ) \ - { \ - z = mxGetComplexSingles(a); \ - for (i = 0; i < n; ++i) \ - (*z++).real = (float) *q++; \ - (*z++).imag = 0; \ - } \ - else \ - { \ - p = mxGetSingles(a); \ - for (i = 0; i < n; ++i) \ - *p++ = (float) *q++; \ - } \ -} - - -#define mxWrapReturnDef_single(func, T) \ -mxArray* func(const T* q, mwSize m, mwSize n) \ -{ \ - mwIndex i; \ - float* p; \ - if (!q) { \ - return mxCreateNumericMatrix(0,0, mxSINGLE_CLASS, mxREAL); \ - } else { \ - mxArray* a = mxCreateNumericMatrix(m,n, mxSINGLE_CLASS, mxREAL); \ - p = mxGetSingles(a); \ - for (i = 0; i < m*n; ++i) \ - *p++ = (float) *q++; \ - return a; \ - } \ -} - - -#define mxWrapGetScalarZDef_single(func, T, ZT, setz) \ -void func(T* z, const mxArray* a) \ -{ \ - if( mxIsComplex(a) ) \ - { \ - setz(z, (ZT) (*mxGetComplexSingles(a)).real, (ZT) (*mxGetComplexSingles(a)).imag); \ - } \ - else \ - { \ - setz(z, (ZT) (*mxGetComplexSingles(a)).real, (ZT) 0); \ - } \ -} - - -#define mxWrapGetArrayZDef_single(func, T, ZT, setz) \ -T* func(const mxArray* a, const char** e) \ -{ \ - T* array; \ - mwSize arraylen; \ - mwIndex i; \ - T* p; \ - float* q; \ - mxComplexSingle* z; \ - if (!a || mxGetClassID(a) != mxSINGLE_CLASS) { \ - *e = "Invalid array argument, mxSINGLE_CLASS expected"; \ - return 0; \ - } \ - arraylen = mxGetM(a)*mxGetN(a); \ - array = (T*) mxMalloc(mxGetM(a)*mxGetN(a) * sizeof(T)); \ - p = array; \ - if( mxIsComplex(a) ) \ - { \ - z = mxGetComplexSingles(a); \ - for (i = 0; i < arraylen; ++i) { \ - setz(p, (ZT) (*z).real, (ZT) (*z).imag); \ - ++p; ++z; } \ - } \ - else \ - { \ - q = mxGetSingles(a); \ - for (i = 0; i < arraylen; ++i) { \ - setz(p, (ZT) (*q), (ZT) 0 ); \ - ++p; ++q; } \ - } \ - return array; \ -} - - -#define mxWrapCopyZDef_single(func, T, freal, fimag) \ -void func(mxArray* a, const T* q, mwSize n) \ -{ \ - mwIndex i; \ - float* p; \ - mxComplexSingle* z; \ - if( mxIsComplex(a) ) \ - { \ - z = mxGetComplexSingles(a); \ - for (i = 0; i < n; ++i) { \ - (*z).real = freal(*q); \ - (*z).imag = fimag(*q); \ - ++z; ++q; } \ - } \ - else \ - { \ - p = mxGetSingles(a); \ - for (i = 0; i < n; ++i) \ - *p++ = freal(*q++); \ - } \ -} - - -#define mxWrapReturnZDef_single(func, T, freal, fimag) \ -mxArray* func(const T* q, mwSize m, mwSize n) \ -{ \ - mwIndex i; \ - mxComplexSingle* p; \ - if (!q) { \ - return mxCreateNumericMatrix(0,0, mxSINGLE_CLASS, mxCOMPLEX); \ - } else { \ - mxArray* a = mxCreateNumericMatrix(m,n, mxSINGLE_CLASS, mxCOMPLEX); \ - p = mxGetComplexSingles(a); \ - for (i = 0; i < m*n; ++i) { \ - (*p).real = freal(*q); \ - (*p).imag = fimag(*q); \ - ++p; ++q; } \ - return a; \ - } \ -} - - + if (p == 0) *e = "Invalid pointer"; + return 
p; +} + +mxArray *mxWrapCreateP_single(void *p, const char *fmt) { + if (p == 0) { + mxArray *z = mxCreateNumericMatrix(1, 1, mxSINGLE_CLASS, mxREAL); + *mxGetSingles(z) = 0; + return z; + } else { + char pbuf[128]; + sprintf(pbuf, fmt, p); + return mxCreateString(pbuf); + } +} + +mxArray *mxWrapStrncpy_single(const char *s) { + if (s) { + return mxCreateString(s); + } else { + mxArray *z = mxCreateNumericMatrix(1, 1, mxSINGLE_CLASS, mxREAL); + *mxGetSingles(z) = 0; + return z; + } +} + +char *mxWrapGetString_single(const mxArray *a, const char **e) { + char *s; + mwSize slen; + if (!a || (!mxIsChar(a) && mxGetM(a) * mxGetN(a) > 0)) { + *e = "Invalid string argument"; + return NULL; + } + slen = mxGetM(a) * mxGetN(a) + 1; + s = (char *)mxMalloc(slen); + if (mxGetM(a) * mxGetN(a) == 0) + *s = 0; + else + mxGetString(a, s, slen); + return s; +} + +float mxWrapGetScalar_single(const mxArray *a, const char **e) { + if (!a || mxGetClassID(a) != mxSINGLE_CLASS || mxGetM(a) * mxGetN(a) != 1) { + *e = "Invalid scalar argument"; + return 0; + } + if (mxIsComplex(a)) + return (float)(*mxGetComplexSingles(a)).real; + else + return (float)(*mxGetSingles(a)); +} + +#define mxWrapGetArrayDef_single(func, T) \ + T *func(const mxArray *a, const char **e) { \ + T *array; \ + mwSize arraylen; \ + mwIndex i; \ + T *p; \ + float *q; \ + mxComplexSingle *z; \ + if (!a || mxGetClassID(a) != mxSINGLE_CLASS) { \ + *e = "Invalid array argument, mxSINGLE_CLASS expected"; \ + return 0; \ + } \ + arraylen = mxGetM(a) * mxGetN(a); \ + array = (T *)mxMalloc(mxGetM(a) * mxGetN(a) * sizeof(T)); \ + p = array; \ + if (mxIsComplex(a)) { \ + z = mxGetComplexSingles(a); \ + for (i = 0; i < arraylen; ++i) *p++ = (T)(*z++).real; \ + } else { \ + q = mxGetSingles(a); \ + for (i = 0; i < arraylen; ++i) *p++ = (T)(*q++); \ + } \ + return array; \ + } + +#define mxWrapCopyDef_single(func, T) \ + void func(mxArray *a, const T *q, mwSize n) { \ + mwIndex i; \ + float *p; \ + mxComplexSingle *z; \ + if (mxIsComplex(a)) { \ + z = mxGetComplexSingles(a); \ + for (i = 0; i < n; ++i) (*z++).real = (float)*q++; \ + (*z++).imag = 0; \ + } else { \ + p = mxGetSingles(a); \ + for (i = 0; i < n; ++i) *p++ = (float)*q++; \ + } \ + } + +#define mxWrapReturnDef_single(func, T) \ + mxArray *func(const T *q, mwSize m, mwSize n) { \ + mwIndex i; \ + float *p; \ + if (!q) { \ + return mxCreateNumericMatrix(0, 0, mxSINGLE_CLASS, mxREAL); \ + } else { \ + mxArray *a = mxCreateNumericMatrix(m, n, mxSINGLE_CLASS, mxREAL); \ + p = mxGetSingles(a); \ + for (i = 0; i < m * n; ++i) *p++ = (float)*q++; \ + return a; \ + } \ + } + +#define mxWrapGetScalarZDef_single(func, T, ZT, setz) \ + void func(T *z, const mxArray *a) { \ + if (mxIsComplex(a)) { \ + setz(z, (ZT)(*mxGetComplexSingles(a)).real, (ZT)(*mxGetComplexSingles(a)).imag); \ + } else { \ + setz(z, (ZT)(*mxGetComplexSingles(a)).real, (ZT)0); \ + } \ + } + +#define mxWrapGetArrayZDef_single(func, T, ZT, setz) \ + T *func(const mxArray *a, const char **e) { \ + T *array; \ + mwSize arraylen; \ + mwIndex i; \ + T *p; \ + float *q; \ + mxComplexSingle *z; \ + if (!a || mxGetClassID(a) != mxSINGLE_CLASS) { \ + *e = "Invalid array argument, mxSINGLE_CLASS expected"; \ + return 0; \ + } \ + arraylen = mxGetM(a) * mxGetN(a); \ + array = (T *)mxMalloc(mxGetM(a) * mxGetN(a) * sizeof(T)); \ + p = array; \ + if (mxIsComplex(a)) { \ + z = mxGetComplexSingles(a); \ + for (i = 0; i < arraylen; ++i) { \ + setz(p, (ZT)(*z).real, (ZT)(*z).imag); \ + ++p; \ + ++z; \ + } \ + } else { \ + q = mxGetSingles(a); \ + for (i 
= 0; i < arraylen; ++i) { \ + setz(p, (ZT)(*q), (ZT)0); \ + ++p; \ + ++q; \ + } \ + } \ + return array; \ + } + +#define mxWrapCopyZDef_single(func, T, freal, fimag) \ + void func(mxArray *a, const T *q, mwSize n) { \ + mwIndex i; \ + float *p; \ + mxComplexSingle *z; \ + if (mxIsComplex(a)) { \ + z = mxGetComplexSingles(a); \ + for (i = 0; i < n; ++i) { \ + (*z).real = freal(*q); \ + (*z).imag = fimag(*q); \ + ++z; \ + ++q; \ + } \ + } else { \ + p = mxGetSingles(a); \ + for (i = 0; i < n; ++i) *p++ = freal(*q++); \ + } \ + } + +#define mxWrapReturnZDef_single(func, T, freal, fimag) \ + mxArray *func(const T *q, mwSize m, mwSize n) { \ + mwIndex i; \ + mxComplexSingle *p; \ + if (!q) { \ + return mxCreateNumericMatrix(0, 0, mxSINGLE_CLASS, mxCOMPLEX); \ + } else { \ + mxArray *a = mxCreateNumericMatrix(m, n, mxSINGLE_CLASS, mxCOMPLEX); \ + p = mxGetComplexSingles(a); \ + for (i = 0; i < m * n; ++i) { \ + (*p).real = freal(*q); \ + (*p).imag = fimag(*q); \ + ++p; \ + ++q; \ + } \ + return a; \ + } \ + } #else @@ -557,1672 +474,1533 @@ mxArray* func(const T* q, mwSize m, mwSize n) \ * Support routines for copying data into and out of the MEX stubs, -R2017b */ -void* mxWrapGetP(const mxArray* a, const char* fmt, const char** e) -{ - void* p = 0; -#ifdef R2008OO - mxArray* ap; -#endif - if (mxGetClassID(a) == mxDOUBLE_CLASS && - mxGetM(a)*mxGetN(a) == 1 && *mxGetPr(a) == 0) - return p; - if (mxIsChar(a)) { - char pbuf[128]; - mxGetString(a, pbuf, sizeof(pbuf)); - sscanf(pbuf, fmt, &p); - } +void *mxWrapGetP(const mxArray *a, const char *fmt, const char **e) { + void *p = 0; #ifdef R2008OO - else if (ap = mxGetProperty(a, 0, "mwptr")) { - return mxWrapGetP(ap, fmt, e); - } + mxArray *ap; #endif - if (p == 0) - *e = "Invalid pointer"; + if (mxGetClassID(a) == mxDOUBLE_CLASS && mxGetM(a) * mxGetN(a) == 1 && *mxGetPr(a) == 0) return p; -} - -mxArray* mxWrapCreateP(void* p, const char* fmt) -{ - if (p == 0) { - mxArray* z = mxCreateDoubleMatrix(1,1, mxREAL); - *mxGetPr(z) = 0; - return z; - } else { - char pbuf[128]; - sprintf(pbuf, fmt, p); - return mxCreateString(pbuf); - } -} - -mxArray* mxWrapStrncpy(const char* s) -{ - if (s) { - return mxCreateString(s); - } else { - mxArray* z = mxCreateDoubleMatrix(1,1, mxREAL); - *mxGetPr(z) = 0; - return z; - } -} - -double mxWrapGetScalar(const mxArray* a, const char** e) -{ - if (!a || mxGetClassID(a) != mxDOUBLE_CLASS || mxGetM(a)*mxGetN(a) != 1) { - *e = "Invalid scalar argument"; - return 0; - } - return *mxGetPr(a); -} - -char* mxWrapGetString(const mxArray* a, const char** e) -{ - char* s; - mwSize slen; - if (!a || (!mxIsChar(a) && mxGetM(a)*mxGetN(a) > 0)) { - *e = "Invalid string argument"; - return NULL; - } - slen = mxGetM(a)*mxGetN(a) + 1; - s = (char*) mxMalloc(slen); - if (mxGetM(a)*mxGetN(a) == 0) - *s = 0; - else - mxGetString(a, s, slen); - return s; -} - - -#define mxWrapGetArrayDef(func, T) \ -T* func(const mxArray* a, const char** e) \ -{ \ - T* array; \ - mwSize arraylen; \ - mwIndex i; \ - T* p; \ - double* q; \ - if (!a || mxGetClassID(a) != mxDOUBLE_CLASS) { \ - *e = "Invalid array argument, mxDOUBLE_CLASS expected"; \ - return 0; \ - } \ - arraylen = mxGetM(a)*mxGetN(a); \ - array = (T*) mxMalloc(mxGetM(a)*mxGetN(a) * sizeof(T)); \ - p = array; \ - q = mxGetPr(a); \ - for (i = 0; i < arraylen; ++i) \ - *p++ = (T) (*q++); \ - return array; \ -} - - -#define mxWrapCopyDef(func, T) \ -void func(mxArray* a, const T* q, mwSize n) \ -{ \ - mwIndex i; \ - double* p = mxGetPr(a); \ - for (i = 0; i < n; ++i) \ - *p++ = *q++; \ -} - - 
-#define mxWrapReturnDef(func, T) \ -mxArray* func(const T* q, mwSize m, mwSize n) \ -{ \ - mwIndex i; \ - double* p; \ - if (!q) { \ - return mxCreateDoubleMatrix(0,0, mxREAL); \ - } else { \ - mxArray* a = mxCreateDoubleMatrix(m,n, mxREAL); \ - p = mxGetPr(a); \ - for (i = 0; i < m*n; ++i) \ - *p++ = *q++; \ - return a; \ - } \ -} - - -#define mxWrapGetScalarZDef(func, T, ZT, setz) \ -void func(T* z, const mxArray* a) \ -{ \ - double* pr = mxGetPr(a); \ - double* pi = mxGetPi(a); \ - setz(z, (ZT) *pr, (pi ? (ZT) *pi : (ZT) 0)); \ -} - - -#define mxWrapGetArrayZDef(func, T, ZT, setz) \ -T* func(const mxArray* a, const char** e) \ -{ \ - T* array; \ - mwSize arraylen; \ - mwIndex i; \ - T* p; \ - double* qr; \ - double* qi; \ - if (!a || mxGetClassID(a) != mxDOUBLE_CLASS) { \ - *e = "Invalid array argument, mxDOUBLE_CLASS expected"; \ - return 0; \ - } \ - arraylen = mxGetM(a)*mxGetN(a); \ - array = (T*) mxMalloc(mxGetM(a)*mxGetN(a) * sizeof(T)); \ - p = array; \ - qr = mxGetPr(a); \ - qi = mxGetPi(a); \ - for (i = 0; i < arraylen; ++i) { \ - ZT val_qr = *qr++; \ - ZT val_qi = (qi ? (ZT) *qi++ : (ZT) 0); \ - setz(p, val_qr, val_qi); \ - ++p; \ - } \ - return array; \ -} - - -#define mxWrapCopyZDef(func, T, real, imag) \ -void func(mxArray* a, const T* q, mwSize n) \ -{ \ - mwIndex i; \ - double* pr = mxGetPr(a); \ - double* pi = mxGetPi(a); \ - for (i = 0; i < n; ++i) { \ - *pr++ = real(*q); \ - *pi++ = imag(*q); \ - ++q; \ - } \ -} - - -#define mxWrapReturnZDef(func, T, real, imag) \ -mxArray* func(const T* q, mwSize m, mwSize n) \ -{ \ - mwIndex i; \ - double* pr; \ - double* pi; \ - if (!q) { \ - return mxCreateDoubleMatrix(0,0, mxCOMPLEX); \ - } else { \ - mxArray* a = mxCreateDoubleMatrix(m,n, mxCOMPLEX); \ - pr = mxGetPr(a); \ - pi = mxGetPi(a); \ - for (i = 0; i < m*n; ++i) { \ - *pr++ = real(*q); \ - *pi++ = imag(*q); \ - ++q; \ - } \ - return a; \ - } \ -} - - - - - - -void* mxWrapGetP_single(const mxArray* a, const char* fmt, const char** e) -{ - void* p = 0; + if (mxIsChar(a)) { + char pbuf[128]; + mxGetString(a, pbuf, sizeof(pbuf)); + sscanf(pbuf, fmt, &p); + } #ifdef R2008OO - mxArray* ap; + else if (ap = mxGetProperty(a, 0, "mwptr")) { + return mxWrapGetP(ap, fmt, e); + } #endif - if (mxGetClassID(a) == mxSINGLE_CLASS && - mxGetM(a)*mxGetN(a) == 1 && *((float*)mxGetData(a)) == 0) - return p; - if (mxIsChar(a)) { - char pbuf[128]; - mxGetString(a, pbuf, sizeof(pbuf)); - sscanf(pbuf, fmt, &p); - } + if (p == 0) *e = "Invalid pointer"; + return p; +} + +mxArray *mxWrapCreateP(void *p, const char *fmt) { + if (p == 0) { + mxArray *z = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetPr(z) = 0; + return z; + } else { + char pbuf[128]; + sprintf(pbuf, fmt, p); + return mxCreateString(pbuf); + } +} + +mxArray *mxWrapStrncpy(const char *s) { + if (s) { + return mxCreateString(s); + } else { + mxArray *z = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetPr(z) = 0; + return z; + } +} + +double mxWrapGetScalar(const mxArray *a, const char **e) { + if (!a || mxGetClassID(a) != mxDOUBLE_CLASS || mxGetM(a) * mxGetN(a) != 1) { + *e = "Invalid scalar argument"; + return 0; + } + return *mxGetPr(a); +} + +char *mxWrapGetString(const mxArray *a, const char **e) { + char *s; + mwSize slen; + if (!a || (!mxIsChar(a) && mxGetM(a) * mxGetN(a) > 0)) { + *e = "Invalid string argument"; + return NULL; + } + slen = mxGetM(a) * mxGetN(a) + 1; + s = (char *)mxMalloc(slen); + if (mxGetM(a) * mxGetN(a) == 0) + *s = 0; + else + mxGetString(a, s, slen); + return s; +} + +#define mxWrapGetArrayDef(func, T) \ + T 
*func(const mxArray *a, const char **e) { \ + T *array; \ + mwSize arraylen; \ + mwIndex i; \ + T *p; \ + double *q; \ + if (!a || mxGetClassID(a) != mxDOUBLE_CLASS) { \ + *e = "Invalid array argument, mxDOUBLE_CLASS expected"; \ + return 0; \ + } \ + arraylen = mxGetM(a) * mxGetN(a); \ + array = (T *)mxMalloc(mxGetM(a) * mxGetN(a) * sizeof(T)); \ + p = array; \ + q = mxGetPr(a); \ + for (i = 0; i < arraylen; ++i) *p++ = (T)(*q++); \ + return array; \ + } + +#define mxWrapCopyDef(func, T) \ + void func(mxArray *a, const T *q, mwSize n) { \ + mwIndex i; \ + double *p = mxGetPr(a); \ + for (i = 0; i < n; ++i) *p++ = *q++; \ + } + +#define mxWrapReturnDef(func, T) \ + mxArray *func(const T *q, mwSize m, mwSize n) { \ + mwIndex i; \ + double *p; \ + if (!q) { \ + return mxCreateDoubleMatrix(0, 0, mxREAL); \ + } else { \ + mxArray *a = mxCreateDoubleMatrix(m, n, mxREAL); \ + p = mxGetPr(a); \ + for (i = 0; i < m * n; ++i) *p++ = *q++; \ + return a; \ + } \ + } + +#define mxWrapGetScalarZDef(func, T, ZT, setz) \ + void func(T *z, const mxArray *a) { \ + double *pr = mxGetPr(a); \ + double *pi = mxGetPi(a); \ + setz(z, (ZT) * pr, (pi ? (ZT) * pi : (ZT)0)); \ + } + +#define mxWrapGetArrayZDef(func, T, ZT, setz) \ + T *func(const mxArray *a, const char **e) { \ + T *array; \ + mwSize arraylen; \ + mwIndex i; \ + T *p; \ + double *qr; \ + double *qi; \ + if (!a || mxGetClassID(a) != mxDOUBLE_CLASS) { \ + *e = "Invalid array argument, mxDOUBLE_CLASS expected"; \ + return 0; \ + } \ + arraylen = mxGetM(a) * mxGetN(a); \ + array = (T *)mxMalloc(mxGetM(a) * mxGetN(a) * sizeof(T)); \ + p = array; \ + qr = mxGetPr(a); \ + qi = mxGetPi(a); \ + for (i = 0; i < arraylen; ++i) { \ + ZT val_qr = *qr++; \ + ZT val_qi = (qi ? (ZT) * qi++ : (ZT)0); \ + setz(p, val_qr, val_qi); \ + ++p; \ + } \ + return array; \ + } + +#define mxWrapCopyZDef(func, T, real, imag) \ + void func(mxArray *a, const T *q, mwSize n) { \ + mwIndex i; \ + double *pr = mxGetPr(a); \ + double *pi = mxGetPi(a); \ + for (i = 0; i < n; ++i) { \ + *pr++ = real(*q); \ + *pi++ = imag(*q); \ + ++q; \ + } \ + } + +#define mxWrapReturnZDef(func, T, real, imag) \ + mxArray *func(const T *q, mwSize m, mwSize n) { \ + mwIndex i; \ + double *pr; \ + double *pi; \ + if (!q) { \ + return mxCreateDoubleMatrix(0, 0, mxCOMPLEX); \ + } else { \ + mxArray *a = mxCreateDoubleMatrix(m, n, mxCOMPLEX); \ + pr = mxGetPr(a); \ + pi = mxGetPi(a); \ + for (i = 0; i < m * n; ++i) { \ + *pr++ = real(*q); \ + *pi++ = imag(*q); \ + ++q; \ + } \ + return a; \ + } \ + } + +void *mxWrapGetP_single(const mxArray *a, const char *fmt, const char **e) { + void *p = 0; #ifdef R2008OO - else if (ap = mxGetProperty(a, 0, "mwptr")) { - return mxWrapGetP(ap, fmt, e); - } + mxArray *ap; #endif - if (p == 0) - *e = "Invalid pointer"; + if (mxGetClassID(a) == mxSINGLE_CLASS && mxGetM(a) * mxGetN(a) == 1 && + *((float *)mxGetData(a)) == 0) return p; -} - -mxArray* mxWrapCreateP_single(void* p, const char* fmt) -{ - if (p == 0) { - mxArray* z = mxCreateNumericMatrix(1,1, mxSINGLE_CLASS, mxREAL); - *((float*)mxGetData(z)) = 0; - return z; - } else { - char pbuf[128]; - sprintf(pbuf, fmt, p); - return mxCreateString(pbuf); - } -} -mxArray* mxWrapStrncpy_single(const char* s) -{ - if (s) { - return mxCreateString(s); - } else { - mxArray* z = mxCreateNumericMatrix(1,1, mxSINGLE_CLASS, mxREAL); - *((float*)mxGetData(z)) = 0; - return z; - } -} - -float mxWrapGetScalar_single(const mxArray* a, const char** e) -{ - if (!a || mxGetClassID(a) != mxSINGLE_CLASS || mxGetM(a)*mxGetN(a) != 1) { - *e = 
"Invalid scalar argument"; - return 0; - } - return *((float*)mxGetData(a)); -} - -char* mxWrapGetString_single(const mxArray* a, const char** e) -{ - char* s; - mwSize slen; - if (!a || (!mxIsChar(a) && mxGetM(a)*mxGetN(a) > 0)) { - *e = "Invalid string argument, mxSINGLE_CLASS expected"; - return NULL; - } - slen = mxGetM(a)*mxGetN(a) + 1; - s = (char*) mxMalloc(slen); - if (mxGetM(a)*mxGetN(a) == 0) - *s = 0; - else - mxGetString(a, s, slen); - return s; -} - - -#define mxWrapGetArrayDef_single(func, T) \ -T* func(const mxArray* a, const char** e) \ -{ \ - T* array; \ - mwSize arraylen; \ - mwIndex i; \ - T* p; \ - float* q; \ - if (!a || mxGetClassID(a) != mxSINGLE_CLASS) { \ - *e = "Invalid array argument, mxSINGLE_CLASS expected"; \ - return 0; \ - } \ - arraylen = mxGetM(a)*mxGetN(a); \ - array = (T*) mxMalloc(mxGetM(a)*mxGetN(a) * sizeof(T)); \ - p = array; \ - q = (float*) mxGetData(a); \ - for (i = 0; i < arraylen; ++i) \ - *p++ = (T) (*q++); \ - return array; \ -} - - -#define mxWrapCopyDef_single(func, T) \ -void func(mxArray* a, const T* q, mwSize n) \ -{ \ - mwIndex i; \ - float* p = (float*) mxGetData(a); \ - for (i = 0; i < n; ++i) \ - *p++ = *q++; \ -} - - -#define mxWrapReturnDef_single(func, T) \ -mxArray* func(const T* q, mwSize m, mwSize n) \ -{ \ - mwIndex i; \ - float* p; \ - if (!q) { \ - return mxCreateNumericMatrix(0,0, mxSINGLE_CLASS, mxREAL); \ - } else { \ - mxArray* a = mxCreateNumericMatrix(m,n, mxSINGLE_CLASS, mxREAL);\ - p = (float*) mxGetData(a); \ - for (i = 0; i < m*n; ++i) \ - *p++ = *q++; \ - return a; \ - } \ -} - + if (mxIsChar(a)) { + char pbuf[128]; + mxGetString(a, pbuf, sizeof(pbuf)); + sscanf(pbuf, fmt, &p); + } +#ifdef R2008OO + else if (ap = mxGetProperty(a, 0, "mwptr")) { + return mxWrapGetP(ap, fmt, e); + } +#endif + if (p == 0) *e = "Invalid pointer"; + return p; +} + +mxArray *mxWrapCreateP_single(void *p, const char *fmt) { + if (p == 0) { + mxArray *z = mxCreateNumericMatrix(1, 1, mxSINGLE_CLASS, mxREAL); + *((float *)mxGetData(z)) = 0; + return z; + } else { + char pbuf[128]; + sprintf(pbuf, fmt, p); + return mxCreateString(pbuf); + } +} +mxArray *mxWrapStrncpy_single(const char *s) { + if (s) { + return mxCreateString(s); + } else { + mxArray *z = mxCreateNumericMatrix(1, 1, mxSINGLE_CLASS, mxREAL); + *((float *)mxGetData(z)) = 0; + return z; + } +} + +float mxWrapGetScalar_single(const mxArray *a, const char **e) { + if (!a || mxGetClassID(a) != mxSINGLE_CLASS || mxGetM(a) * mxGetN(a) != 1) { + *e = "Invalid scalar argument"; + return 0; + } + return *((float *)mxGetData(a)); +} + +char *mxWrapGetString_single(const mxArray *a, const char **e) { + char *s; + mwSize slen; + if (!a || (!mxIsChar(a) && mxGetM(a) * mxGetN(a) > 0)) { + *e = "Invalid string argument, mxSINGLE_CLASS expected"; + return NULL; + } + slen = mxGetM(a) * mxGetN(a) + 1; + s = (char *)mxMalloc(slen); + if (mxGetM(a) * mxGetN(a) == 0) + *s = 0; + else + mxGetString(a, s, slen); + return s; +} + +#define mxWrapGetArrayDef_single(func, T) \ + T *func(const mxArray *a, const char **e) { \ + T *array; \ + mwSize arraylen; \ + mwIndex i; \ + T *p; \ + float *q; \ + if (!a || mxGetClassID(a) != mxSINGLE_CLASS) { \ + *e = "Invalid array argument, mxSINGLE_CLASS expected"; \ + return 0; \ + } \ + arraylen = mxGetM(a) * mxGetN(a); \ + array = (T *)mxMalloc(mxGetM(a) * mxGetN(a) * sizeof(T)); \ + p = array; \ + q = (float *)mxGetData(a); \ + for (i = 0; i < arraylen; ++i) *p++ = (T)(*q++); \ + return array; \ + } + +#define mxWrapCopyDef_single(func, T) \ + void func(mxArray 
*a, const T *q, mwSize n) { \ + mwIndex i; \ + float *p = (float *)mxGetData(a); \ + for (i = 0; i < n; ++i) *p++ = *q++; \ + } + +#define mxWrapReturnDef_single(func, T) \ + mxArray *func(const T *q, mwSize m, mwSize n) { \ + mwIndex i; \ + float *p; \ + if (!q) { \ + return mxCreateNumericMatrix(0, 0, mxSINGLE_CLASS, mxREAL); \ + } else { \ + mxArray *a = mxCreateNumericMatrix(m, n, mxSINGLE_CLASS, mxREAL); \ + p = (float *)mxGetData(a); \ + for (i = 0; i < m * n; ++i) *p++ = *q++; \ + return a; \ + } \ + } #define mxWrapGetScalarZDef_single(func, T, ZT, setz) \ -void func(T* z, const mxArray* a) \ -{ \ - float* pr = (float*) mxGetData(a); \ - float* pi = (float*) mxGetImagData(a); \ - setz(z, (ZT) *pr, (pi ? (ZT) *pi : (ZT) 0)); \ -} - - -#define mxWrapGetArrayZDef_single(func, T, ZT, setz) \ -T* func(const mxArray* a, const char** e) \ -{ \ - T* array; \ - mwSize arraylen; \ - mwIndex i; \ - T* p; \ - float* qr; \ - float* qi; \ - if (!a || mxGetClassID(a) != mxSINGLE_CLASS) { \ - *e = "Invalid array argument, mxSINGLE_CLASS expected"; \ - return 0; \ - } \ - arraylen = mxGetM(a)*mxGetN(a); \ - array = (T*) mxMalloc(mxGetM(a)*mxGetN(a) * sizeof(T)); \ - p = array; \ - qr = (float*) mxGetData(a); \ - qi = (float*) mxGetImagData(a); \ - for (i = 0; i < arraylen; ++i) { \ - ZT val_qr = *qr++; \ - ZT val_qi = (qi ? (ZT) *qi++ : (ZT) 0); \ - setz(p, val_qr, val_qi); \ - ++p; \ - } \ - return array; \ -} - + void func(T *z, const mxArray *a) { \ + float *pr = (float *)mxGetData(a); \ + float *pi = (float *)mxGetImagData(a); \ + setz(z, (ZT) * pr, (pi ? (ZT) * pi : (ZT)0)); \ + } + +#define mxWrapGetArrayZDef_single(func, T, ZT, setz) \ + T *func(const mxArray *a, const char **e) { \ + T *array; \ + mwSize arraylen; \ + mwIndex i; \ + T *p; \ + float *qr; \ + float *qi; \ + if (!a || mxGetClassID(a) != mxSINGLE_CLASS) { \ + *e = "Invalid array argument, mxSINGLE_CLASS expected"; \ + return 0; \ + } \ + arraylen = mxGetM(a) * mxGetN(a); \ + array = (T *)mxMalloc(mxGetM(a) * mxGetN(a) * sizeof(T)); \ + p = array; \ + qr = (float *)mxGetData(a); \ + qi = (float *)mxGetImagData(a); \ + for (i = 0; i < arraylen; ++i) { \ + ZT val_qr = *qr++; \ + ZT val_qi = (qi ? 
(ZT) * qi++ : (ZT)0); \ + setz(p, val_qr, val_qi); \ + ++p; \ + } \ + return array; \ + } #define mxWrapCopyZDef_single(func, T, real, imag) \ -void func(mxArray* a, const T* q, mwSize n) \ -{ \ - mwIndex i; \ - float* pr = (float*) mxGetData(a); \ - float* pi = (float*) mxGetImagData(a); \ - for (i = 0; i < n; ++i) { \ - *pr++ = real(*q); \ - *pi++ = imag(*q); \ - ++q; \ - } \ -} - - -#define mxWrapReturnZDef_single(func, T, real, imag) \ -mxArray* func(const T* q, mwSize m, mwSize n) \ -{ \ - mwIndex i; \ - float* pr; \ - float* pi; \ - if (!q) { \ - return mxCreateNumericMatrix(0,0, mxSINGLE_CLASS, mxCOMPLEX); \ - } else { \ - mxArray* a = mxCreateNumericMatrix(m,n, mxSINGLE_CLASS, mxCOMPLEX);\ - pr = (float*) mxGetData(a); \ - pi = (float*) mxGetImagData(a); \ - for (i = 0; i < m*n; ++i) { \ - *pr++ = real(*q); \ - *pi++ = imag(*q); \ - ++q; \ - } \ - return a; \ - } \ -} - - - - + void func(mxArray *a, const T *q, mwSize n) { \ + mwIndex i; \ + float *pr = (float *)mxGetData(a); \ + float *pi = (float *)mxGetImagData(a); \ + for (i = 0; i < n; ++i) { \ + *pr++ = real(*q); \ + *pi++ = imag(*q); \ + ++q; \ + } \ + } + +#define mxWrapReturnZDef_single(func, T, real, imag) \ + mxArray *func(const T *q, mwSize m, mwSize n) { \ + mwIndex i; \ + float *pr; \ + float *pi; \ + if (!q) { \ + return mxCreateNumericMatrix(0, 0, mxSINGLE_CLASS, mxCOMPLEX); \ + } else { \ + mxArray *a = mxCreateNumericMatrix(m, n, mxSINGLE_CLASS, mxCOMPLEX); \ + pr = (float *)mxGetData(a); \ + pi = (float *)mxGetImagData(a); \ + for (i = 0; i < m * n; ++i) { \ + *pr++ = real(*q); \ + *pi++ = imag(*q); \ + ++q; \ + } \ + return a; \ + } \ + } #endif #include <complex> typedef std::complex<double> dcomplex; -#define real_dcomplex(z) std::real(z) -#define imag_dcomplex(z) std::imag(z) -#define setz_dcomplex(z,r,i) *z = dcomplex(r,i) +#define real_dcomplex(z) std::real(z) +#define imag_dcomplex(z) std::imag(z) +#define setz_dcomplex(z, r, i) *z = dcomplex(r, i) typedef std::complex<float> fcomplex; -#define real_fcomplex(z) std::real(z) -#define imag_fcomplex(z) std::imag(z) -#define setz_fcomplex(z,r,i) *z = fcomplex(r,i) - - #include - #include - #include - #include - #include - void copy_finufft_opts(const mxArray* om, finufft_opts *oc) { - if(!mxIsStruct(om)) - mexErrMsgIdAndTxt("FINUFFT:inputNotStruct","opts input must be a structure."); - mwIndex idx = 0; - int ifield, nfields; - const char **fname; - nfields = mxGetNumberOfFields(om); - fname = (const char**)mxCalloc(nfields, sizeof(*fname)); - for(ifield=0; ifield<nfields; ifield++) { - fname[ifield] = mxGetFieldNameByNumber(om,ifield); - if (strcmp(fname[ifield],"debug") == 0) { - oc->debug = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); - } - else if (strcmp(fname[ifield],"spread_debug") == 0) { - oc->spread_debug = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); - } - else if (strcmp(fname[ifield],"spread_sort") == 0) { - oc->spread_sort = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); - } - else if (strcmp(fname[ifield],"spread_kerevalmeth") == 0) { - oc->spread_kerevalmeth = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); - } - else if (strcmp(fname[ifield],"spread_kerpad") == 0) { - oc->spread_kerpad = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); - } - else if (strcmp(fname[ifield],"fftw") == 0) { - oc->fftw = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); - } - else if (strcmp(fname[ifield],"modeord") == 0) { - oc->modeord = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); - } - else if (strcmp(fname[ifield],"upsampfac") == 0) { - oc->upsampfac = (double)*mxGetPr(mxGetFieldByNumber(om,idx,ifield)); - } - else if
(strcmp(fname[ifield],"spread_thread") == 0) { - oc->spread_thread = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); - } - else if (strcmp(fname[ifield],"maxbatchsize") == 0) { - oc->maxbatchsize = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); - } - else if (strcmp(fname[ifield],"nthreads") == 0) { - oc->nthreads = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); - } - else if (strcmp(fname[ifield],"spread_nthr_atomic") == 0) { - oc->spread_nthr_atomic = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); - } - else if (strcmp(fname[ifield],"spread_max_sp_size") == 0) { - oc->spread_max_sp_size = (int)round(*mxGetPr(mxGetFieldByNumber(om,idx,ifield))); - } - else - continue; - } - mxFree(fname); - } - void finufft_mex_setup() { - /* Forces MATLAB to properly initialize their FFTW library. */ - mexEvalString("fft(1:8);"); - } - - +#define real_fcomplex(z) std::real(z) +#define imag_fcomplex(z) std::imag(z) +#define setz_fcomplex(z, r, i) *z = fcomplex(r, i) + +#include +#include +#include +#include +#include +void copy_finufft_opts(const mxArray *om, finufft_opts *oc) { + if (!mxIsStruct(om)) + mexErrMsgIdAndTxt("FINUFFT:inputNotStruct", "opts input must be a structure."); + mwIndex idx = 0; + int ifield, nfields; + const char **fname; + nfields = mxGetNumberOfFields(om); + fname = (const char **)mxCalloc(nfields, sizeof(*fname)); + for (ifield = 0; ifield < nfields; ifield++) { + fname[ifield] = mxGetFieldNameByNumber(om, ifield); + if (strcmp(fname[ifield], "debug") == 0) { + oc->debug = (int)round(*mxGetPr(mxGetFieldByNumber(om, idx, ifield))); + } else if (strcmp(fname[ifield], "spread_debug") == 0) { + oc->spread_debug = (int)round(*mxGetPr(mxGetFieldByNumber(om, idx, ifield))); + } else if (strcmp(fname[ifield], "spread_sort") == 0) { + oc->spread_sort = (int)round(*mxGetPr(mxGetFieldByNumber(om, idx, ifield))); + } else if (strcmp(fname[ifield], "spread_kerevalmeth") == 0) { + oc->spread_kerevalmeth = (int)round(*mxGetPr(mxGetFieldByNumber(om, idx, ifield))); + } else if (strcmp(fname[ifield], "spread_kerpad") == 0) { + oc->spread_kerpad = (int)round(*mxGetPr(mxGetFieldByNumber(om, idx, ifield))); + } else if (strcmp(fname[ifield], "fftw") == 0) { + oc->fftw = (int)round(*mxGetPr(mxGetFieldByNumber(om, idx, ifield))); + } else if (strcmp(fname[ifield], "modeord") == 0) { + oc->modeord = (int)round(*mxGetPr(mxGetFieldByNumber(om, idx, ifield))); + } else if (strcmp(fname[ifield], "upsampfac") == 0) { + oc->upsampfac = (double)*mxGetPr(mxGetFieldByNumber(om, idx, ifield)); + } else if (strcmp(fname[ifield], "spread_thread") == 0) { + oc->spread_thread = (int)round(*mxGetPr(mxGetFieldByNumber(om, idx, ifield))); + } else if (strcmp(fname[ifield], "maxbatchsize") == 0) { + oc->maxbatchsize = (int)round(*mxGetPr(mxGetFieldByNumber(om, idx, ifield))); + } else if (strcmp(fname[ifield], "nthreads") == 0) { + oc->nthreads = (int)round(*mxGetPr(mxGetFieldByNumber(om, idx, ifield))); + } else if (strcmp(fname[ifield], "spread_nthr_atomic") == 0) { + oc->spread_nthr_atomic = (int)round(*mxGetPr(mxGetFieldByNumber(om, idx, ifield))); + } else if (strcmp(fname[ifield], "spread_max_sp_size") == 0) { + oc->spread_max_sp_size = (int)round(*mxGetPr(mxGetFieldByNumber(om, idx, ifield))); + } else + continue; + } + mxFree(fname); +} +void finufft_mex_setup() { + /* Forces MATLAB to properly initialize their FFTW library. 
*/ + mexEvalString("fft(1:8);"); +} /* Array copier definitions */ -mxWrapGetArrayDef(mxWrapGetArray_bool, bool) -mxWrapCopyDef (mxWrapCopy_bool, bool) -mxWrapReturnDef (mxWrapReturn_bool, bool) -mxWrapGetArrayDef_single(mxWrapGetArray_single_bool, bool) -mxWrapCopyDef_single (mxWrapCopy_single_bool, bool) -mxWrapReturnDef_single (mxWrapReturn_single_bool, bool) -mxWrapGetArrayDef(mxWrapGetArray_char, char) -mxWrapCopyDef (mxWrapCopy_char, char) -mxWrapReturnDef (mxWrapReturn_char, char) -mxWrapGetArrayDef_single(mxWrapGetArray_single_char, char) -mxWrapCopyDef_single (mxWrapCopy_single_char, char) -mxWrapReturnDef_single (mxWrapReturn_single_char, char) -mxWrapGetArrayDef(mxWrapGetArray_double, double) -mxWrapCopyDef (mxWrapCopy_double, double) -mxWrapReturnDef (mxWrapReturn_double, double) -mxWrapGetArrayDef_single(mxWrapGetArray_single_double, double) -mxWrapCopyDef_single (mxWrapCopy_single_double, double) -mxWrapReturnDef_single (mxWrapReturn_single_double, double) -mxWrapGetArrayDef(mxWrapGetArray_float, float) -mxWrapCopyDef (mxWrapCopy_float, float) -mxWrapReturnDef (mxWrapReturn_float, float) -mxWrapGetArrayDef_single(mxWrapGetArray_single_float, float) -mxWrapCopyDef_single (mxWrapCopy_single_float, float) -mxWrapReturnDef_single (mxWrapReturn_single_float, float) -mxWrapGetArrayDef(mxWrapGetArray_int, int) -mxWrapCopyDef (mxWrapCopy_int, int) -mxWrapReturnDef (mxWrapReturn_int, int) -mxWrapGetArrayDef_single(mxWrapGetArray_single_int, int) -mxWrapCopyDef_single (mxWrapCopy_single_int, int) -mxWrapReturnDef_single (mxWrapReturn_single_int, int) -mxWrapGetArrayDef(mxWrapGetArray_int64_t, int64_t) -mxWrapCopyDef (mxWrapCopy_int64_t, int64_t) -mxWrapReturnDef (mxWrapReturn_int64_t, int64_t) -mxWrapGetArrayDef_single(mxWrapGetArray_single_int64_t, int64_t) -mxWrapCopyDef_single (mxWrapCopy_single_int64_t, int64_t) -mxWrapReturnDef_single (mxWrapReturn_single_int64_t, int64_t) -mxWrapGetArrayDef(mxWrapGetArray_long, long) -mxWrapCopyDef (mxWrapCopy_long, long) -mxWrapReturnDef (mxWrapReturn_long, long) -mxWrapGetArrayDef_single(mxWrapGetArray_single_long, long) -mxWrapCopyDef_single (mxWrapCopy_single_long, long) -mxWrapReturnDef_single (mxWrapReturn_single_long, long) -mxWrapGetArrayDef(mxWrapGetArray_ptrdiff_t, ptrdiff_t) -mxWrapCopyDef (mxWrapCopy_ptrdiff_t, ptrdiff_t) -mxWrapReturnDef (mxWrapReturn_ptrdiff_t, ptrdiff_t) -mxWrapGetArrayDef_single(mxWrapGetArray_single_ptrdiff_t, ptrdiff_t) -mxWrapCopyDef_single (mxWrapCopy_single_ptrdiff_t, ptrdiff_t) -mxWrapReturnDef_single (mxWrapReturn_single_ptrdiff_t, ptrdiff_t) -mxWrapGetArrayDef(mxWrapGetArray_short, short) -mxWrapCopyDef (mxWrapCopy_short, short) -mxWrapReturnDef (mxWrapReturn_short, short) -mxWrapGetArrayDef_single(mxWrapGetArray_single_short, short) -mxWrapCopyDef_single (mxWrapCopy_single_short, short) -mxWrapReturnDef_single (mxWrapReturn_single_short, short) -mxWrapGetArrayDef(mxWrapGetArray_size_t, size_t) -mxWrapCopyDef (mxWrapCopy_size_t, size_t) -mxWrapReturnDef (mxWrapReturn_size_t, size_t) -mxWrapGetArrayDef_single(mxWrapGetArray_single_size_t, size_t) -mxWrapCopyDef_single (mxWrapCopy_single_size_t, size_t) -mxWrapReturnDef_single (mxWrapReturn_single_size_t, size_t) -mxWrapGetScalarZDef(mxWrapGetScalar_fcomplex, fcomplex, - float, setz_fcomplex) -mxWrapGetArrayZDef (mxWrapGetArray_fcomplex, fcomplex, - float, setz_fcomplex) -mxWrapCopyZDef (mxWrapCopy_fcomplex, fcomplex, - real_fcomplex, imag_fcomplex) -mxWrapReturnZDef (mxWrapReturn_fcomplex, fcomplex, - real_fcomplex, imag_fcomplex) 
-mxWrapGetScalarZDef_single(mxWrapGetScalar_single_fcomplex, fcomplex, - float, setz_fcomplex) -mxWrapGetArrayZDef_single (mxWrapGetArray_single_fcomplex, fcomplex, - float, setz_fcomplex) -mxWrapCopyZDef_single (mxWrapCopy_single_fcomplex, fcomplex, - real_fcomplex, imag_fcomplex) -mxWrapReturnZDef_single (mxWrapReturn_single_fcomplex, fcomplex, - real_fcomplex, imag_fcomplex) -mxWrapGetScalarZDef(mxWrapGetScalar_dcomplex, dcomplex, - double, setz_dcomplex) -mxWrapGetArrayZDef (mxWrapGetArray_dcomplex, dcomplex, - double, setz_dcomplex) -mxWrapCopyZDef (mxWrapCopy_dcomplex, dcomplex, - real_dcomplex, imag_dcomplex) -mxWrapReturnZDef (mxWrapReturn_dcomplex, dcomplex, - real_dcomplex, imag_dcomplex) -mxWrapGetScalarZDef_single(mxWrapGetScalar_single_dcomplex, dcomplex, - double, setz_dcomplex) -mxWrapGetArrayZDef_single (mxWrapGetArray_single_dcomplex, dcomplex, - double, setz_dcomplex) -mxWrapCopyZDef_single (mxWrapCopy_single_dcomplex, dcomplex, - real_dcomplex, imag_dcomplex) -mxWrapReturnZDef_single (mxWrapReturn_single_dcomplex, dcomplex, - real_dcomplex, imag_dcomplex) - -/* ---- finufft.mw: 166 ---- - * finufft_mex_setup(); - */ -static const char* stubids1_ = "finufft_mex_setup()"; - -void mexStub1(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - if (mexprofrecord_) - mexprofrecord_[1]++; - finufft_mex_setup(); +mxWrapGetArrayDef(mxWrapGetArray_bool, bool) mxWrapCopyDef(mxWrapCopy_bool, bool) mxWrapReturnDef( + mxWrapReturn_bool, + bool) mxWrapGetArrayDef_single(mxWrapGetArray_single_bool, + bool) mxWrapCopyDef_single(mxWrapCopy_single_bool, + bool) mxWrapReturnDef_single(mxWrapReturn_single_bool, + bool) + mxWrapGetArrayDef(mxWrapGetArray_char, char) mxWrapCopyDef(mxWrapCopy_char, char) mxWrapReturnDef( + mxWrapReturn_char, + char) mxWrapGetArrayDef_single(mxWrapGetArray_single_char, + char) mxWrapCopyDef_single(mxWrapCopy_single_char, + char) + mxWrapReturnDef_single(mxWrapReturn_single_char, char) mxWrapGetArrayDef( + mxWrapGetArray_double, + double) mxWrapCopyDef(mxWrapCopy_double, + double) mxWrapReturnDef(mxWrapReturn_double, + double) mxWrapGetArrayDef_single(mxWrapGetArray_single_double, + double) + mxWrapCopyDef_single(mxWrapCopy_single_double, double) mxWrapReturnDef_single( + mxWrapReturn_single_double, + double) mxWrapGetArrayDef(mxWrapGetArray_float, + float) mxWrapCopyDef(mxWrapCopy_float, + float) mxWrapReturnDef(mxWrapReturn_float, + float) + mxWrapGetArrayDef_single(mxWrapGetArray_single_float, float) mxWrapCopyDef_single( + mxWrapCopy_single_float, + float) mxWrapReturnDef_single(mxWrapReturn_single_float, + float) mxWrapGetArrayDef(mxWrapGetArray_int, + int) + mxWrapCopyDef(mxWrapCopy_int, int) mxWrapReturnDef(mxWrapReturn_int, int) mxWrapGetArrayDef_single( + mxWrapGetArray_single_int, + int) mxWrapCopyDef_single(mxWrapCopy_single_int, + int) mxWrapReturnDef_single(mxWrapReturn_single_int, + int) mxWrapGetArrayDef(mxWrapGetArray_int64_t, + int64_t) + mxWrapCopyDef(mxWrapCopy_int64_t, int64_t) mxWrapReturnDef(mxWrapReturn_int64_t, int64_t) mxWrapGetArrayDef_single( + mxWrapGetArray_single_int64_t, + int64_t) mxWrapCopyDef_single(mxWrapCopy_single_int64_t, + int64_t) mxWrapReturnDef_single(mxWrapReturn_single_int64_t, + int64_t) + mxWrapGetArrayDef(mxWrapGetArray_long, long) mxWrapCopyDef(mxWrapCopy_long, long) mxWrapReturnDef( + mxWrapReturn_long, + long) mxWrapGetArrayDef_single(mxWrapGetArray_single_long, + long) mxWrapCopyDef_single(mxWrapCopy_single_long, + long) + 
mxWrapReturnDef_single(mxWrapReturn_single_long, long) mxWrapGetArrayDef( + mxWrapGetArray_ptrdiff_t, + ptrdiff_t) mxWrapCopyDef(mxWrapCopy_ptrdiff_t, + ptrdiff_t) mxWrapReturnDef(mxWrapReturn_ptrdiff_t, ptrdiff_t) + mxWrapGetArrayDef_single(mxWrapGetArray_single_ptrdiff_t, ptrdiff_t) mxWrapCopyDef_single( + mxWrapCopy_single_ptrdiff_t, + ptrdiff_t) mxWrapReturnDef_single(mxWrapReturn_single_ptrdiff_t, + ptrdiff_t) + mxWrapGetArrayDef(mxWrapGetArray_short, short) mxWrapCopyDef( + mxWrapCopy_short, + short) mxWrapReturnDef(mxWrapReturn_short, + short) mxWrapGetArrayDef_single(mxWrapGetArray_single_short, + short) + mxWrapCopyDef_single(mxWrapCopy_single_short, short) mxWrapReturnDef_single( + mxWrapReturn_single_short, + short) mxWrapGetArrayDef(mxWrapGetArray_size_t, + size_t) mxWrapCopyDef(mxWrapCopy_size_t, size_t) + mxWrapReturnDef(mxWrapReturn_size_t, size_t) mxWrapGetArrayDef_single( + mxWrapGetArray_single_size_t, + size_t) mxWrapCopyDef_single(mxWrapCopy_single_size_t, size_t) + mxWrapReturnDef_single(mxWrapReturn_single_size_t, size_t) mxWrapGetScalarZDef( + mxWrapGetScalar_fcomplex, + fcomplex, float, + setz_fcomplex) mxWrapGetArrayZDef(mxWrapGetArray_fcomplex, fcomplex, float, setz_fcomplex) + mxWrapCopyZDef(mxWrapCopy_fcomplex, fcomplex, real_fcomplex, imag_fcomplex) mxWrapReturnZDef( + mxWrapReturn_fcomplex, + fcomplex, real_fcomplex, + imag_fcomplex) + mxWrapGetScalarZDef_single( + mxWrapGetScalar_single_fcomplex, + fcomplex, float, + setz_fcomplex) mxWrapGetArrayZDef_single(mxWrapGetArray_single_fcomplex, + fcomplex, + float, setz_fcomplex) + mxWrapCopyZDef_single( + mxWrapCopy_single_fcomplex, + fcomplex, + real_fcomplex, + imag_fcomplex) + mxWrapReturnZDef_single( + mxWrapReturn_single_fcomplex, + fcomplex, + real_fcomplex, + imag_fcomplex) mxWrapGetScalarZDef(mxWrapGetScalar_dcomplex, + dcomplex, + double, + setz_dcomplex) + mxWrapGetArrayZDef( + mxWrapGetArray_dcomplex, + dcomplex, + double, + setz_dcomplex) + mxWrapCopyZDef( + mxWrapCopy_dcomplex, + dcomplex, + real_dcomplex, + imag_dcomplex) + mxWrapReturnZDef( + mxWrapReturn_dcomplex, + dcomplex, + real_dcomplex, + imag_dcomplex) + mxWrapGetScalarZDef_single( + mxWrapGetScalar_single_dcomplex, + dcomplex, + double, + setz_dcomplex) + mxWrapGetArrayZDef_single( + mxWrapGetArray_single_dcomplex, + dcomplex, + double, + setz_dcomplex) + mxWrapCopyZDef_single( + mxWrapCopy_single_dcomplex, + dcomplex, + real_dcomplex, + imag_dcomplex) + mxWrapReturnZDef_single( + mxWrapReturn_single_dcomplex, + dcomplex, + real_dcomplex, + imag_dcomplex) + + /* ---- finufft.mw: 166 ---- + * finufft_mex_setup(); + */ + static const char *stubids1_ = "finufft_mex_setup()"; + +void mexStub1(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + if (mexprofrecord_) mexprofrecord_[1]++; + finufft_mex_setup(); mw_err_label: - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 167 ---- * finufft_opts* o = new(); */ -static const char* stubids2_ = "o finufft_opts* = new()"; +static const char *stubids2_ = "o finufft_opts* = new()"; -void mexStub2(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - finufft_opts* out0_=0; /* o */ +void mexStub2(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + finufft_opts *out0_ = 0; /* o */ - if (mexprofrecord_) - mexprofrecord_[2]++; - out0_ = new finufft_opts(); - plhs[0] = mxWrapCreateP(out0_, "finufft_opts:%p"); + if 
(mexprofrecord_) mexprofrecord_[2]++; + out0_ = new finufft_opts(); + plhs[0] = mxWrapCreateP(out0_, "finufft_opts:%p"); mw_err_label: - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 169 ---- * finufft_plan* p = new(); */ -static const char* stubids3_ = "o finufft_plan* = new()"; +static const char *stubids3_ = "o finufft_plan* = new()"; -void mexStub3(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - finufft_plan* out0_=0; /* p */ +void mexStub3(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + finufft_plan *out0_ = 0; /* p */ - if (mexprofrecord_) - mexprofrecord_[3]++; - out0_ = new finufft_plan(); - plhs[0] = mxWrapCreateP(out0_, "finufft_plan:%p"); + if (mexprofrecord_) mexprofrecord_[3]++; + out0_ = new finufft_plan(); + plhs[0] = mxWrapCreateP(out0_, "finufft_plan:%p"); mw_err_label: - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 170 ---- * finufft_default_opts(finufft_opts* o); */ -static const char* stubids4_ = "finufft_default_opts(i finufft_opts*)"; +static const char *stubids4_ = "finufft_default_opts(i finufft_opts*)"; -void mexStub4(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - finufft_opts* in0_ =0; /* o */ +void mexStub4(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + finufft_opts *in0_ = 0; /* o */ - in0_ = (finufft_opts*) mxWrapGetP(prhs[0], "finufft_opts:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mexprofrecord_) - mexprofrecord_[4]++; - finufft_default_opts(in0_); + in0_ = (finufft_opts *)mxWrapGetP(prhs[0], "finufft_opts:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mexprofrecord_) mexprofrecord_[4]++; + finufft_default_opts(in0_); mw_err_label: - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 172 ---- * finufftf_plan* p = new(); */ -static const char* stubids5_ = "o finufftf_plan* = new()"; +static const char *stubids5_ = "o finufftf_plan* = new()"; -void mexStub5(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - finufftf_plan* out0_=0; /* p */ +void mexStub5(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + finufftf_plan *out0_ = 0; /* p */ - if (mexprofrecord_) - mexprofrecord_[5]++; - out0_ = new finufftf_plan(); - plhs[0] = mxWrapCreateP(out0_, "finufftf_plan:%p"); + if (mexprofrecord_) mexprofrecord_[5]++; + out0_ = new finufftf_plan(); + plhs[0] = mxWrapCreateP(out0_, "finufftf_plan:%p"); mw_err_label: - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 173 ---- * finufftf_default_opts(finufft_opts* o); */ -static const char* stubids6_ = "finufftf_default_opts(i finufft_opts*)"; +static const char *stubids6_ = "finufftf_default_opts(i finufft_opts*)"; -void mexStub6(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - finufft_opts* in0_ =0; /* o */ +void mexStub6(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + finufft_opts *in0_ = 0; /* o */ - in0_ = (finufft_opts*) mxWrapGetP(prhs[0], "finufft_opts:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mexprofrecord_) - 
mexprofrecord_[6]++; - finufftf_default_opts(in0_); + in0_ = (finufft_opts *)mxWrapGetP(prhs[0], "finufft_opts:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mexprofrecord_) mexprofrecord_[6]++; + finufftf_default_opts(in0_); mw_err_label: - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 184 ---- * copy_finufft_opts(mxArray opts, finufft_opts* o); */ -static const char* stubids7_ = "copy_finufft_opts(i mxArray, i finufft_opts*)"; - -void mexStub7(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - const mxArray* in0_; /* opts */ - finufft_opts* in1_ =0; /* o */ - - in0_ = prhs[0]; - in1_ = (finufft_opts*) mxWrapGetP(prhs[1], "finufft_opts:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mexprofrecord_) - mexprofrecord_[7]++; - copy_finufft_opts(in0_, in1_); +static const char *stubids7_ = "copy_finufft_opts(i mxArray, i finufft_opts*)"; + +void mexStub7(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + const mxArray *in0_; /* opts */ + finufft_opts *in1_ = 0; /* o */ + + in0_ = prhs[0]; + in1_ = (finufft_opts *)mxWrapGetP(prhs[1], "finufft_opts:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mexprofrecord_) mexprofrecord_[7]++; + copy_finufft_opts(in0_, in1_); mw_err_label: - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 187 ---- - * int ier = finufft_makeplan(int type, int dim, int64_t[3] n_modes, int iflag, int n_trans, double tol, finufft_plan* plan, finufft_opts* o); + * int ier = finufft_makeplan(int type, int dim, int64_t[3] n_modes, int iflag, int + * n_trans, double tol, finufft_plan* plan, finufft_opts* o); */ -static const char* stubids8_ = "o int = finufft_makeplan(i int, i int, i int64_t[x], i int, i int, i double, i finufft_plan*, i finufft_opts*)"; - -void mexStub8(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - int in0_; /* type */ - int in1_; /* dim */ - int64_t* in2_ =0; /* n_modes */ - int in3_; /* iflag */ - int in4_; /* n_trans */ - double in5_; /* tol */ - finufft_plan* in6_ =0; /* plan */ - finufft_opts* in7_ =0; /* o */ - int out0_; /* ier */ - mwSize dim8_; /* 3 */ - - dim8_ = (mwSize) mxWrapGetScalar(prhs[8], &mw_err_txt_); - - if (mxGetM(prhs[2])*mxGetN(prhs[2]) != dim8_) { - mw_err_txt_ = "Bad argument size: n_modes"; goto mw_err_label; - } - - if( mxGetClassID(prhs[0]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; - in0_ = (int) mxWrapGetScalar(prhs[0], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if( mxGetClassID(prhs[1]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; - in1_ = (int) mxWrapGetScalar(prhs[1], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mxGetM(prhs[2])*mxGetN(prhs[2]) != 0) { - in2_ = mxWrapGetArray_int64_t(prhs[2], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - } else - in2_ = NULL; - if( mxGetClassID(prhs[3]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; +static const char *stubids8_ = "o int = finufft_makeplan(i int, i int, i int64_t[x], i " + "int, i int, i double, i finufft_plan*, i finufft_opts*)"; + +void mexStub8(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const 
char *mw_err_txt_ = 0; + int in0_; /* type */ + int in1_; /* dim */ + int64_t *in2_ = 0; /* n_modes */ + int in3_; /* iflag */ + int in4_; /* n_trans */ + double in5_; /* tol */ + finufft_plan *in6_ = 0; /* plan */ + finufft_opts *in7_ = 0; /* o */ + int out0_; /* ier */ + mwSize dim8_; /* 3 */ + + dim8_ = (mwSize)mxWrapGetScalar(prhs[8], &mw_err_txt_); + + if (mxGetM(prhs[2]) * mxGetN(prhs[2]) != dim8_) { + mw_err_txt_ = "Bad argument size: n_modes"; + goto mw_err_label; + } + + if (mxGetClassID(prhs[0]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in0_ = (int)mxWrapGetScalar(prhs[0], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetClassID(prhs[1]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in1_ = (int)mxWrapGetScalar(prhs[1], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetM(prhs[2]) * mxGetN(prhs[2]) != 0) { + in2_ = mxWrapGetArray_int64_t(prhs[2], &mw_err_txt_); if (mw_err_txt_) goto mw_err_label; - in3_ = (int) mxWrapGetScalar(prhs[3], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if( mxGetClassID(prhs[4]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; - in4_ = (int) mxWrapGetScalar(prhs[4], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if( mxGetClassID(prhs[5]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; - in5_ = (double) mxWrapGetScalar(prhs[5], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - in6_ = (finufft_plan*) mxWrapGetP(prhs[6], "finufft_plan:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - in7_ = (finufft_opts*) mxWrapGetP(prhs[7], "finufft_opts:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mexprofrecord_) - mexprofrecord_[8]++; - out0_ = finufft_makeplan(in0_, in1_, in2_, in3_, in4_, in5_, in6_, in7_); + } else + in2_ = NULL; + if (mxGetClassID(prhs[3]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in3_ = (int)mxWrapGetScalar(prhs[3], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetClassID(prhs[4]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in4_ = (int)mxWrapGetScalar(prhs[4], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetClassID(prhs[5]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in5_ = (double)mxWrapGetScalar(prhs[5], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + in6_ = (finufft_plan *)mxWrapGetP(prhs[6], "finufft_plan:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + in7_ = (finufft_opts *)mxWrapGetP(prhs[7], "finufft_opts:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mexprofrecord_) mexprofrecord_[8]++; + out0_ = finufft_makeplan(in0_, in1_, in2_, in3_, in4_, in5_, in6_, in7_); #if MX_HAS_INTERLEAVED_COMPLEX - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetDoubles(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetDoubles(plhs[0]) = out0_; #else - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetPr(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetPr(plhs[0]) = out0_; #endif 
mw_err_label: - if (in2_) mxFree(in2_); - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (in2_) mxFree(in2_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 190 ---- - * int ier = finufftf_makeplan(int type, int dim, int64_t[3] n_modes, int iflag, int n_trans, float tol, finufftf_plan* plan, finufft_opts* o); + * int ier = finufftf_makeplan(int type, int dim, int64_t[3] n_modes, int iflag, int + * n_trans, float tol, finufftf_plan* plan, finufft_opts* o); */ -static const char* stubids9_ = "o int = finufftf_makeplan(i int, i int, i int64_t[x], i int, i int, i float, i finufftf_plan*, i finufft_opts*)"; - -void mexStub9(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - int in0_; /* type */ - int in1_; /* dim */ - int64_t* in2_ =0; /* n_modes */ - int in3_; /* iflag */ - int in4_; /* n_trans */ - float in5_; /* tol */ - finufftf_plan* in6_ =0; /* plan */ - finufft_opts* in7_ =0; /* o */ - int out0_; /* ier */ - mwSize dim8_; /* 3 */ - - dim8_ = (mwSize) mxWrapGetScalar(prhs[8], &mw_err_txt_); - - if (mxGetM(prhs[2])*mxGetN(prhs[2]) != dim8_) { - mw_err_txt_ = "Bad argument size: n_modes"; goto mw_err_label; - } - - if( mxGetClassID(prhs[0]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; - in0_ = (int) mxWrapGetScalar(prhs[0], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if( mxGetClassID(prhs[1]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; - in1_ = (int) mxWrapGetScalar(prhs[1], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mxGetM(prhs[2])*mxGetN(prhs[2]) != 0) { - in2_ = mxWrapGetArray_int64_t(prhs[2], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - } else - in2_ = NULL; - if( mxGetClassID(prhs[3]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; +static const char *stubids9_ = "o int = finufftf_makeplan(i int, i int, i int64_t[x], i " + "int, i int, i float, i finufftf_plan*, i finufft_opts*)"; + +void mexStub9(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + int in0_; /* type */ + int in1_; /* dim */ + int64_t *in2_ = 0; /* n_modes */ + int in3_; /* iflag */ + int in4_; /* n_trans */ + float in5_; /* tol */ + finufftf_plan *in6_ = 0; /* plan */ + finufft_opts *in7_ = 0; /* o */ + int out0_; /* ier */ + mwSize dim8_; /* 3 */ + + dim8_ = (mwSize)mxWrapGetScalar(prhs[8], &mw_err_txt_); + + if (mxGetM(prhs[2]) * mxGetN(prhs[2]) != dim8_) { + mw_err_txt_ = "Bad argument size: n_modes"; + goto mw_err_label; + } + + if (mxGetClassID(prhs[0]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in0_ = (int)mxWrapGetScalar(prhs[0], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetClassID(prhs[1]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in1_ = (int)mxWrapGetScalar(prhs[1], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetM(prhs[2]) * mxGetN(prhs[2]) != 0) { + in2_ = mxWrapGetArray_int64_t(prhs[2], &mw_err_txt_); if (mw_err_txt_) goto mw_err_label; - in3_ = (int) mxWrapGetScalar(prhs[3], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if( mxGetClassID(prhs[4]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; - if 
(mw_err_txt_) goto mw_err_label; - in4_ = (int) mxWrapGetScalar(prhs[4], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if( mxGetClassID(prhs[5]) != mxSINGLE_CLASS ) - mw_err_txt_ = "Invalid scalar argument, mxSINGLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; - in5_ = (float) mxWrapGetScalar_single(prhs[5], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - in6_ = (finufftf_plan*) mxWrapGetP(prhs[6], "finufftf_plan:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - in7_ = (finufft_opts*) mxWrapGetP(prhs[7], "finufft_opts:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mexprofrecord_) - mexprofrecord_[9]++; - out0_ = finufftf_makeplan(in0_, in1_, in2_, in3_, in4_, in5_, in6_, in7_); + } else + in2_ = NULL; + if (mxGetClassID(prhs[3]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in3_ = (int)mxWrapGetScalar(prhs[3], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetClassID(prhs[4]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in4_ = (int)mxWrapGetScalar(prhs[4], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetClassID(prhs[5]) != mxSINGLE_CLASS) + mw_err_txt_ = "Invalid scalar argument, mxSINGLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in5_ = (float)mxWrapGetScalar_single(prhs[5], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + in6_ = (finufftf_plan *)mxWrapGetP(prhs[6], "finufftf_plan:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + in7_ = (finufft_opts *)mxWrapGetP(prhs[7], "finufft_opts:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mexprofrecord_) mexprofrecord_[9]++; + out0_ = finufftf_makeplan(in0_, in1_, in2_, in3_, in4_, in5_, in6_, in7_); #if MX_HAS_INTERLEAVED_COMPLEX - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetDoubles(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetDoubles(plhs[0]) = out0_; #else - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetPr(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetPr(plhs[0]) = out0_; #endif mw_err_label: - if (in2_) mxFree(in2_); - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (in2_) mxFree(in2_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 192 ---- * delete(finufft_opts* o); */ -static const char* stubids10_ = "delete(i finufft_opts*)"; +static const char *stubids10_ = "delete(i finufft_opts*)"; -void mexStub10(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - finufft_opts* in0_ =0; /* o */ +void mexStub10(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + finufft_opts *in0_ = 0; /* o */ - in0_ = (finufft_opts*) mxWrapGetP(prhs[0], "finufft_opts:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mexprofrecord_) - mexprofrecord_[10]++; - delete(in0_); + in0_ = (finufft_opts *)mxWrapGetP(prhs[0], "finufft_opts:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mexprofrecord_) mexprofrecord_[10]++; + delete (in0_); mw_err_label: - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 222 ---- - * int ier = finufft_setpts(finufft_plan plan, int64_t nj, double[] xj, double[] yj, double[] zj, int64_t nk, double[] s, double[] t, double[] u); + * int ier = finufft_setpts(finufft_plan 
plan, int64_t nj, double[] xj, double[] yj, + * double[] zj, int64_t nk, double[] s, double[] t, double[] u); */ -static const char* stubids11_ = "o int = finufft_setpts(i finufft_plan, i int64_t, i double[], i double[], i double[], i int64_t, i double[], i double[], i double[])"; - -void mexStub11(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - finufft_plan* in0_ =0; /* plan */ - int64_t in1_; /* nj */ - double* in2_ =0; /* xj */ - double* in3_ =0; /* yj */ - double* in4_ =0; /* zj */ - int64_t in5_; /* nk */ - double* in6_ =0; /* s */ - double* in7_ =0; /* t */ - double* in8_ =0; /* u */ - int out0_; /* ier */ - - in0_ = (finufft_plan*) mxWrapGetP(prhs[0], "finufft_plan:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if( mxGetClassID(prhs[1]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; +static const char *stubids11_ = + "o int = finufft_setpts(i finufft_plan, i int64_t, i double[], i double[], i " + "double[], i int64_t, i double[], i double[], i double[])"; + +void mexStub11(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + finufft_plan *in0_ = 0; /* plan */ + int64_t in1_; /* nj */ + double *in2_ = 0; /* xj */ + double *in3_ = 0; /* yj */ + double *in4_ = 0; /* zj */ + int64_t in5_; /* nk */ + double *in6_ = 0; /* s */ + double *in7_ = 0; /* t */ + double *in8_ = 0; /* u */ + int out0_; /* ier */ + + in0_ = (finufft_plan *)mxWrapGetP(prhs[0], "finufft_plan:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetClassID(prhs[1]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in1_ = (int64_t)mxWrapGetScalar(prhs[1], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetM(prhs[2]) * mxGetN(prhs[2]) != 0) { + if (mxGetClassID(prhs[2]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; if (mw_err_txt_) goto mw_err_label; - in1_ = (int64_t) mxWrapGetScalar(prhs[1], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mxGetM(prhs[2])*mxGetN(prhs[2]) != 0) { - if( mxGetClassID(prhs[2]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; #if MX_HAS_INTERLEAVED_COMPLEX - in2_ = mxGetDoubles(prhs[2]); + in2_ = mxGetDoubles(prhs[2]); #else - in2_ = mxGetPr(prhs[2]); + in2_ = mxGetPr(prhs[2]); #endif - } else - in2_ = NULL; - if (mxGetM(prhs[3])*mxGetN(prhs[3]) != 0) { - if( mxGetClassID(prhs[3]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; + } else + in2_ = NULL; + if (mxGetM(prhs[3]) * mxGetN(prhs[3]) != 0) { + if (mxGetClassID(prhs[3]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; #if MX_HAS_INTERLEAVED_COMPLEX - in3_ = mxGetDoubles(prhs[3]); + in3_ = mxGetDoubles(prhs[3]); #else - in3_ = mxGetPr(prhs[3]); + in3_ = mxGetPr(prhs[3]); #endif - } else - in3_ = NULL; - if (mxGetM(prhs[4])*mxGetN(prhs[4]) != 0) { - if( mxGetClassID(prhs[4]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; + } else + in3_ = NULL; + if (mxGetM(prhs[4]) * mxGetN(prhs[4]) != 0) { + if (mxGetClassID(prhs[4]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto 
mw_err_label; #if MX_HAS_INTERLEAVED_COMPLEX - in4_ = mxGetDoubles(prhs[4]); + in4_ = mxGetDoubles(prhs[4]); #else - in4_ = mxGetPr(prhs[4]); + in4_ = mxGetPr(prhs[4]); #endif - } else - in4_ = NULL; - if( mxGetClassID(prhs[5]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; + } else + in4_ = NULL; + if (mxGetClassID(prhs[5]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in5_ = (int64_t)mxWrapGetScalar(prhs[5], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetM(prhs[6]) * mxGetN(prhs[6]) != 0) { + if (mxGetClassID(prhs[6]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; if (mw_err_txt_) goto mw_err_label; - in5_ = (int64_t) mxWrapGetScalar(prhs[5], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mxGetM(prhs[6])*mxGetN(prhs[6]) != 0) { - if( mxGetClassID(prhs[6]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; #if MX_HAS_INTERLEAVED_COMPLEX - in6_ = mxGetDoubles(prhs[6]); + in6_ = mxGetDoubles(prhs[6]); #else - in6_ = mxGetPr(prhs[6]); + in6_ = mxGetPr(prhs[6]); #endif - } else - in6_ = NULL; - if (mxGetM(prhs[7])*mxGetN(prhs[7]) != 0) { - if( mxGetClassID(prhs[7]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; + } else + in6_ = NULL; + if (mxGetM(prhs[7]) * mxGetN(prhs[7]) != 0) { + if (mxGetClassID(prhs[7]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; #if MX_HAS_INTERLEAVED_COMPLEX - in7_ = mxGetDoubles(prhs[7]); + in7_ = mxGetDoubles(prhs[7]); #else - in7_ = mxGetPr(prhs[7]); + in7_ = mxGetPr(prhs[7]); #endif - } else - in7_ = NULL; - if (mxGetM(prhs[8])*mxGetN(prhs[8]) != 0) { - if( mxGetClassID(prhs[8]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; + } else + in7_ = NULL; + if (mxGetM(prhs[8]) * mxGetN(prhs[8]) != 0) { + if (mxGetClassID(prhs[8]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; #if MX_HAS_INTERLEAVED_COMPLEX - in8_ = mxGetDoubles(prhs[8]); + in8_ = mxGetDoubles(prhs[8]); #else - in8_ = mxGetPr(prhs[8]); + in8_ = mxGetPr(prhs[8]); #endif - } else - in8_ = NULL; - if (!in0_) { - mw_err_txt_ = "Argument plan cannot be null"; - goto mw_err_label; - } - if (mexprofrecord_) - mexprofrecord_[11]++; - out0_ = finufft_setpts(*in0_, in1_, in2_, in3_, in4_, in5_, in6_, in7_, in8_); + } else + in8_ = NULL; + if (!in0_) { + mw_err_txt_ = "Argument plan cannot be null"; + goto mw_err_label; + } + if (mexprofrecord_) mexprofrecord_[11]++; + out0_ = finufft_setpts(*in0_, in1_, in2_, in3_, in4_, in5_, in6_, in7_, in8_); #if MX_HAS_INTERLEAVED_COMPLEX - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetDoubles(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetDoubles(plhs[0]) = out0_; #else - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetPr(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetPr(plhs[0]) = out0_; #endif mw_err_label: - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 224 ---- - * int ier = finufftf_setpts(finufftf_plan plan, int64_t nj, float[] xj, float[] yj, float[] zj, int64_t nk, 
float[] s, float[] t, float[] u); + * int ier = finufftf_setpts(finufftf_plan plan, int64_t nj, float[] xj, float[] yj, + * float[] zj, int64_t nk, float[] s, float[] t, float[] u); */ -static const char* stubids12_ = "o int = finufftf_setpts(i finufftf_plan, i int64_t, i float[], i float[], i float[], i int64_t, i float[], i float[], i float[])"; - -void mexStub12(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - finufftf_plan* in0_ =0; /* plan */ - int64_t in1_; /* nj */ - float* in2_ =0; /* xj */ - float* in3_ =0; /* yj */ - float* in4_ =0; /* zj */ - int64_t in5_; /* nk */ - float* in6_ =0; /* s */ - float* in7_ =0; /* t */ - float* in8_ =0; /* u */ - int out0_; /* ier */ - - in0_ = (finufftf_plan*) mxWrapGetP(prhs[0], "finufftf_plan:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if( mxGetClassID(prhs[1]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; +static const char *stubids12_ = + "o int = finufftf_setpts(i finufftf_plan, i int64_t, i float[], i float[], i " + "float[], i int64_t, i float[], i float[], i float[])"; + +void mexStub12(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + finufftf_plan *in0_ = 0; /* plan */ + int64_t in1_; /* nj */ + float *in2_ = 0; /* xj */ + float *in3_ = 0; /* yj */ + float *in4_ = 0; /* zj */ + int64_t in5_; /* nk */ + float *in6_ = 0; /* s */ + float *in7_ = 0; /* t */ + float *in8_ = 0; /* u */ + int out0_; /* ier */ + + in0_ = (finufftf_plan *)mxWrapGetP(prhs[0], "finufftf_plan:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetClassID(prhs[1]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in1_ = (int64_t)mxWrapGetScalar(prhs[1], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetM(prhs[2]) * mxGetN(prhs[2]) != 0) { + if (mxGetClassID(prhs[2]) != mxSINGLE_CLASS) + mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; if (mw_err_txt_) goto mw_err_label; - in1_ = (int64_t) mxWrapGetScalar(prhs[1], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mxGetM(prhs[2])*mxGetN(prhs[2]) != 0) { - if( mxGetClassID(prhs[2]) != mxSINGLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; #if MX_HAS_INTERLEAVED_COMPLEX - in2_ = mxGetSingles(prhs[2]); + in2_ = mxGetSingles(prhs[2]); #else - in2_ = (float*) mxGetData(prhs[2]); + in2_ = (float *)mxGetData(prhs[2]); #endif - } else - in2_ = NULL; - if (mxGetM(prhs[3])*mxGetN(prhs[3]) != 0) { - if( mxGetClassID(prhs[3]) != mxSINGLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; + } else + in2_ = NULL; + if (mxGetM(prhs[3]) * mxGetN(prhs[3]) != 0) { + if (mxGetClassID(prhs[3]) != mxSINGLE_CLASS) + mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; #if MX_HAS_INTERLEAVED_COMPLEX - in3_ = mxGetSingles(prhs[3]); + in3_ = mxGetSingles(prhs[3]); #else - in3_ = (float*) mxGetData(prhs[3]); + in3_ = (float *)mxGetData(prhs[3]); #endif - } else - in3_ = NULL; - if (mxGetM(prhs[4])*mxGetN(prhs[4]) != 0) { - if( mxGetClassID(prhs[4]) != mxSINGLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; + } else + in3_ = NULL; + if (mxGetM(prhs[4]) * mxGetN(prhs[4]) != 0) { + if (mxGetClassID(prhs[4]) != mxSINGLE_CLASS) 
+ mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; #if MX_HAS_INTERLEAVED_COMPLEX - in4_ = mxGetSingles(prhs[4]); + in4_ = mxGetSingles(prhs[4]); #else - in4_ = (float*) mxGetData(prhs[4]); + in4_ = (float *)mxGetData(prhs[4]); #endif - } else - in4_ = NULL; - if( mxGetClassID(prhs[5]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; + } else + in4_ = NULL; + if (mxGetClassID(prhs[5]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid scalar argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in5_ = (int64_t)mxWrapGetScalar(prhs[5], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetM(prhs[6]) * mxGetN(prhs[6]) != 0) { + if (mxGetClassID(prhs[6]) != mxSINGLE_CLASS) + mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; if (mw_err_txt_) goto mw_err_label; - in5_ = (int64_t) mxWrapGetScalar(prhs[5], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mxGetM(prhs[6])*mxGetN(prhs[6]) != 0) { - if( mxGetClassID(prhs[6]) != mxSINGLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; #if MX_HAS_INTERLEAVED_COMPLEX - in6_ = mxGetSingles(prhs[6]); + in6_ = mxGetSingles(prhs[6]); #else - in6_ = (float*) mxGetData(prhs[6]); + in6_ = (float *)mxGetData(prhs[6]); #endif - } else - in6_ = NULL; - if (mxGetM(prhs[7])*mxGetN(prhs[7]) != 0) { - if( mxGetClassID(prhs[7]) != mxSINGLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; + } else + in6_ = NULL; + if (mxGetM(prhs[7]) * mxGetN(prhs[7]) != 0) { + if (mxGetClassID(prhs[7]) != mxSINGLE_CLASS) + mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; #if MX_HAS_INTERLEAVED_COMPLEX - in7_ = mxGetSingles(prhs[7]); + in7_ = mxGetSingles(prhs[7]); #else - in7_ = (float*) mxGetData(prhs[7]); + in7_ = (float *)mxGetData(prhs[7]); #endif - } else - in7_ = NULL; - if (mxGetM(prhs[8])*mxGetN(prhs[8]) != 0) { - if( mxGetClassID(prhs[8]) != mxSINGLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; + } else + in7_ = NULL; + if (mxGetM(prhs[8]) * mxGetN(prhs[8]) != 0) { + if (mxGetClassID(prhs[8]) != mxSINGLE_CLASS) + mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; #if MX_HAS_INTERLEAVED_COMPLEX - in8_ = mxGetSingles(prhs[8]); + in8_ = mxGetSingles(prhs[8]); #else - in8_ = (float*) mxGetData(prhs[8]); + in8_ = (float *)mxGetData(prhs[8]); #endif - } else - in8_ = NULL; - if (!in0_) { - mw_err_txt_ = "Argument plan cannot be null"; - goto mw_err_label; - } - if (mexprofrecord_) - mexprofrecord_[12]++; - out0_ = finufftf_setpts(*in0_, in1_, in2_, in3_, in4_, in5_, in6_, in7_, in8_); + } else + in8_ = NULL; + if (!in0_) { + mw_err_txt_ = "Argument plan cannot be null"; + goto mw_err_label; + } + if (mexprofrecord_) mexprofrecord_[12]++; + out0_ = finufftf_setpts(*in0_, in1_, in2_, in3_, in4_, in5_, in6_, in7_, in8_); #if MX_HAS_INTERLEAVED_COMPLEX - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetDoubles(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetDoubles(plhs[0]) = out0_; #else - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetPr(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetPr(plhs[0]) = out0_; #endif mw_err_label: - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if 
(mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 251 ---- - * int ier = finufft_execute(finufft_plan plan, dcomplex[] data_in, output dcomplex[ncoeffs] result); + * int ier = finufft_execute(finufft_plan plan, dcomplex[] data_in, output + * dcomplex[ncoeffs] result); */ -static const char* stubids13_ = "o int = finufft_execute(i finufft_plan, i dcomplex[], o dcomplex[x])"; - -void mexStub13(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - finufft_plan* in0_ =0; /* plan */ - dcomplex* in1_ =0; /* data_in */ - int out0_; /* ier */ - dcomplex* out1_=0; /* result */ - mwSize dim2_; /* ncoeffs */ - - dim2_ = (mwSize) mxWrapGetScalar(prhs[2], &mw_err_txt_); - - in0_ = (finufft_plan*) mxWrapGetP(prhs[0], "finufft_plan:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mxGetM(prhs[1])*mxGetN(prhs[1]) != 0) { - if( mxGetClassID(prhs[1]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; - in1_ = mxWrapGetArray_dcomplex(prhs[1], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - } else - in1_ = NULL; - if (!in0_) { - mw_err_txt_ = "Argument plan cannot be null"; - goto mw_err_label; - } - out1_ = (dcomplex*) mxMalloc(dim2_*sizeof(dcomplex)); - if (mexprofrecord_) - mexprofrecord_[13]++; - out0_ = finufft_execute(*in0_, in1_, out1_); +static const char *stubids13_ = + "o int = finufft_execute(i finufft_plan, i dcomplex[], o dcomplex[x])"; + +void mexStub13(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + finufft_plan *in0_ = 0; /* plan */ + dcomplex *in1_ = 0; /* data_in */ + int out0_; /* ier */ + dcomplex *out1_ = 0; /* result */ + mwSize dim2_; /* ncoeffs */ + + dim2_ = (mwSize)mxWrapGetScalar(prhs[2], &mw_err_txt_); + + in0_ = (finufft_plan *)mxWrapGetP(prhs[0], "finufft_plan:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetM(prhs[1]) * mxGetN(prhs[1]) != 0) { + if (mxGetClassID(prhs[1]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in1_ = mxWrapGetArray_dcomplex(prhs[1], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + } else + in1_ = NULL; + if (!in0_) { + mw_err_txt_ = "Argument plan cannot be null"; + goto mw_err_label; + } + out1_ = (dcomplex *)mxMalloc(dim2_ * sizeof(dcomplex)); + if (mexprofrecord_) mexprofrecord_[13]++; + out0_ = finufft_execute(*in0_, in1_, out1_); #if MX_HAS_INTERLEAVED_COMPLEX - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetDoubles(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetDoubles(plhs[0]) = out0_; #else - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetPr(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetPr(plhs[0]) = out0_; #endif - plhs[1] = mxCreateDoubleMatrix(dim2_, 1, mxCOMPLEX); - mxWrapCopy_dcomplex(plhs[1], out1_, dim2_); + plhs[1] = mxCreateDoubleMatrix(dim2_, 1, mxCOMPLEX); + mxWrapCopy_dcomplex(plhs[1], out1_, dim2_); mw_err_label: - if (in1_) mxFree(in1_); - if (out1_) mxFree(out1_); - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (in1_) mxFree(in1_); + if (out1_) mxFree(out1_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 253 ---- - * int ier = finufftf_execute(finufftf_plan plan, fcomplex[] data_in, output fcomplex[ncoeffs] result); + * int ier = finufftf_execute(finufftf_plan plan, fcomplex[] data_in, output + * fcomplex[ncoeffs] result); */ -static 
const char* stubids14_ = "o int = finufftf_execute(i finufftf_plan, i fcomplex[], o fcomplex[x])"; - -void mexStub14(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - finufftf_plan* in0_ =0; /* plan */ - fcomplex* in1_ =0; /* data_in */ - int out0_; /* ier */ - fcomplex* out1_=0; /* result */ - mwSize dim2_; /* ncoeffs */ - - dim2_ = (mwSize) mxWrapGetScalar(prhs[2], &mw_err_txt_); - - in0_ = (finufftf_plan*) mxWrapGetP(prhs[0], "finufftf_plan:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mxGetM(prhs[1])*mxGetN(prhs[1]) != 0) { - if( mxGetClassID(prhs[1]) != mxSINGLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; - in1_ = mxWrapGetArray_single_fcomplex(prhs[1], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - } else - in1_ = NULL; - if (!in0_) { - mw_err_txt_ = "Argument plan cannot be null"; - goto mw_err_label; - } - out1_ = (fcomplex*) mxMalloc(dim2_*sizeof(fcomplex)); - if (mexprofrecord_) - mexprofrecord_[14]++; - out0_ = finufftf_execute(*in0_, in1_, out1_); +static const char *stubids14_ = + "o int = finufftf_execute(i finufftf_plan, i fcomplex[], o fcomplex[x])"; + +void mexStub14(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + finufftf_plan *in0_ = 0; /* plan */ + fcomplex *in1_ = 0; /* data_in */ + int out0_; /* ier */ + fcomplex *out1_ = 0; /* result */ + mwSize dim2_; /* ncoeffs */ + + dim2_ = (mwSize)mxWrapGetScalar(prhs[2], &mw_err_txt_); + + in0_ = (finufftf_plan *)mxWrapGetP(prhs[0], "finufftf_plan:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetM(prhs[1]) * mxGetN(prhs[1]) != 0) { + if (mxGetClassID(prhs[1]) != mxSINGLE_CLASS) + mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in1_ = mxWrapGetArray_single_fcomplex(prhs[1], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + } else + in1_ = NULL; + if (!in0_) { + mw_err_txt_ = "Argument plan cannot be null"; + goto mw_err_label; + } + out1_ = (fcomplex *)mxMalloc(dim2_ * sizeof(fcomplex)); + if (mexprofrecord_) mexprofrecord_[14]++; + out0_ = finufftf_execute(*in0_, in1_, out1_); #if MX_HAS_INTERLEAVED_COMPLEX - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetDoubles(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetDoubles(plhs[0]) = out0_; #else - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetPr(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetPr(plhs[0]) = out0_; #endif - plhs[1] = mxCreateNumericMatrix(dim2_, 1, mxSINGLE_CLASS, mxCOMPLEX); - mxWrapCopy_single_fcomplex(plhs[1], out1_, dim2_); + plhs[1] = mxCreateNumericMatrix(dim2_, 1, mxSINGLE_CLASS, mxCOMPLEX); + mxWrapCopy_single_fcomplex(plhs[1], out1_, dim2_); mw_err_label: - if (in1_) mxFree(in1_); - if (out1_) mxFree(out1_); - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (in1_) mxFree(in1_); + if (out1_) mxFree(out1_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 259 ---- - * int ier = finufft_execute(finufft_plan plan, output dcomplex[nj, n_trans] result, dcomplex[] data_in); + * int ier = finufft_execute(finufft_plan plan, output dcomplex[nj, n_trans] result, + * dcomplex[] data_in); */ -static const char* stubids15_ = "o int = finufft_execute(i finufft_plan, o dcomplex[xx], i dcomplex[])"; - -void mexStub15(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - 
finufft_plan* in0_ =0; /* plan */ - dcomplex* in1_ =0; /* data_in */ - int out0_; /* ier */ - dcomplex* out1_=0; /* result */ - mwSize dim2_; /* nj */ - mwSize dim3_; /* n_trans */ - - dim2_ = (mwSize) mxWrapGetScalar(prhs[2], &mw_err_txt_); - dim3_ = (mwSize) mxWrapGetScalar(prhs[3], &mw_err_txt_); - - in0_ = (finufft_plan*) mxWrapGetP(prhs[0], "finufft_plan:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mxGetM(prhs[1])*mxGetN(prhs[1]) != 0) { - if( mxGetClassID(prhs[1]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; - in1_ = mxWrapGetArray_dcomplex(prhs[1], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - } else - in1_ = NULL; - if (!in0_) { - mw_err_txt_ = "Argument plan cannot be null"; - goto mw_err_label; - } - out1_ = (dcomplex*) mxMalloc(dim2_*dim3_*sizeof(dcomplex)); - if (mexprofrecord_) - mexprofrecord_[15]++; - out0_ = finufft_execute(*in0_, out1_, in1_); +static const char *stubids15_ = + "o int = finufft_execute(i finufft_plan, o dcomplex[xx], i dcomplex[])"; + +void mexStub15(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + finufft_plan *in0_ = 0; /* plan */ + dcomplex *in1_ = 0; /* data_in */ + int out0_; /* ier */ + dcomplex *out1_ = 0; /* result */ + mwSize dim2_; /* nj */ + mwSize dim3_; /* n_trans */ + + dim2_ = (mwSize)mxWrapGetScalar(prhs[2], &mw_err_txt_); + dim3_ = (mwSize)mxWrapGetScalar(prhs[3], &mw_err_txt_); + + in0_ = (finufft_plan *)mxWrapGetP(prhs[0], "finufft_plan:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetM(prhs[1]) * mxGetN(prhs[1]) != 0) { + if (mxGetClassID(prhs[1]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in1_ = mxWrapGetArray_dcomplex(prhs[1], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + } else + in1_ = NULL; + if (!in0_) { + mw_err_txt_ = "Argument plan cannot be null"; + goto mw_err_label; + } + out1_ = (dcomplex *)mxMalloc(dim2_ * dim3_ * sizeof(dcomplex)); + if (mexprofrecord_) mexprofrecord_[15]++; + out0_ = finufft_execute(*in0_, out1_, in1_); #if MX_HAS_INTERLEAVED_COMPLEX - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetDoubles(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetDoubles(plhs[0]) = out0_; #else - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetPr(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetPr(plhs[0]) = out0_; #endif - plhs[1] = mxCreateDoubleMatrix(dim2_, dim3_, mxCOMPLEX); - mxWrapCopy_dcomplex(plhs[1], out1_, dim2_*dim3_); + plhs[1] = mxCreateDoubleMatrix(dim2_, dim3_, mxCOMPLEX); + mxWrapCopy_dcomplex(plhs[1], out1_, dim2_ * dim3_); mw_err_label: - if (out1_) mxFree(out1_); - if (in1_) mxFree(in1_); - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (out1_) mxFree(out1_); + if (in1_) mxFree(in1_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 261 ---- - * int ier = finufftf_execute(finufftf_plan plan, output fcomplex[nj, n_trans] result, fcomplex[] data_in); + * int ier = finufftf_execute(finufftf_plan plan, output fcomplex[nj, n_trans] result, + * fcomplex[] data_in); */ -static const char* stubids16_ = "o int = finufftf_execute(i finufftf_plan, o fcomplex[xx], i fcomplex[])"; - -void mexStub16(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - finufftf_plan* in0_ =0; /* plan */ - fcomplex* in1_ =0; /* data_in */ - int 
out0_; /* ier */ - fcomplex* out1_=0; /* result */ - mwSize dim2_; /* nj */ - mwSize dim3_; /* n_trans */ - - dim2_ = (mwSize) mxWrapGetScalar(prhs[2], &mw_err_txt_); - dim3_ = (mwSize) mxWrapGetScalar(prhs[3], &mw_err_txt_); - - in0_ = (finufftf_plan*) mxWrapGetP(prhs[0], "finufftf_plan:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mxGetM(prhs[1])*mxGetN(prhs[1]) != 0) { - if( mxGetClassID(prhs[1]) != mxSINGLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; - in1_ = mxWrapGetArray_single_fcomplex(prhs[1], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - } else - in1_ = NULL; - if (!in0_) { - mw_err_txt_ = "Argument plan cannot be null"; - goto mw_err_label; - } - out1_ = (fcomplex*) mxMalloc(dim2_*dim3_*sizeof(fcomplex)); - if (mexprofrecord_) - mexprofrecord_[16]++; - out0_ = finufftf_execute(*in0_, out1_, in1_); +static const char *stubids16_ = + "o int = finufftf_execute(i finufftf_plan, o fcomplex[xx], i fcomplex[])"; + +void mexStub16(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + finufftf_plan *in0_ = 0; /* plan */ + fcomplex *in1_ = 0; /* data_in */ + int out0_; /* ier */ + fcomplex *out1_ = 0; /* result */ + mwSize dim2_; /* nj */ + mwSize dim3_; /* n_trans */ + + dim2_ = (mwSize)mxWrapGetScalar(prhs[2], &mw_err_txt_); + dim3_ = (mwSize)mxWrapGetScalar(prhs[3], &mw_err_txt_); + + in0_ = (finufftf_plan *)mxWrapGetP(prhs[0], "finufftf_plan:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetM(prhs[1]) * mxGetN(prhs[1]) != 0) { + if (mxGetClassID(prhs[1]) != mxSINGLE_CLASS) + mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in1_ = mxWrapGetArray_single_fcomplex(prhs[1], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + } else + in1_ = NULL; + if (!in0_) { + mw_err_txt_ = "Argument plan cannot be null"; + goto mw_err_label; + } + out1_ = (fcomplex *)mxMalloc(dim2_ * dim3_ * sizeof(fcomplex)); + if (mexprofrecord_) mexprofrecord_[16]++; + out0_ = finufftf_execute(*in0_, out1_, in1_); #if MX_HAS_INTERLEAVED_COMPLEX - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetDoubles(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetDoubles(plhs[0]) = out0_; #else - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetPr(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetPr(plhs[0]) = out0_; #endif - plhs[1] = mxCreateNumericMatrix(dim2_, dim3_, mxSINGLE_CLASS, mxCOMPLEX); - mxWrapCopy_single_fcomplex(plhs[1], out1_, dim2_*dim3_); + plhs[1] = mxCreateNumericMatrix(dim2_, dim3_, mxSINGLE_CLASS, mxCOMPLEX); + mxWrapCopy_single_fcomplex(plhs[1], out1_, dim2_ * dim3_); mw_err_label: - if (out1_) mxFree(out1_); - if (in1_) mxFree(in1_); - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (out1_) mxFree(out1_); + if (in1_) mxFree(in1_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 265 ---- - * int ier = finufft_execute(finufft_plan plan, dcomplex[] data_in, output dcomplex[nk, n_trans] result); + * int ier = finufft_execute(finufft_plan plan, dcomplex[] data_in, output dcomplex[nk, + * n_trans] result); */ -static const char* stubids17_ = "o int = finufft_execute(i finufft_plan, i dcomplex[], o dcomplex[xx])"; - -void mexStub17(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - finufft_plan* in0_ =0; /* plan */ - dcomplex* in1_ =0; /* data_in */ - int out0_; /* ier 
*/ - dcomplex* out1_=0; /* result */ - mwSize dim2_; /* nk */ - mwSize dim3_; /* n_trans */ - - dim2_ = (mwSize) mxWrapGetScalar(prhs[2], &mw_err_txt_); - dim3_ = (mwSize) mxWrapGetScalar(prhs[3], &mw_err_txt_); - - in0_ = (finufft_plan*) mxWrapGetP(prhs[0], "finufft_plan:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mxGetM(prhs[1])*mxGetN(prhs[1]) != 0) { - if( mxGetClassID(prhs[1]) != mxDOUBLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; - in1_ = mxWrapGetArray_dcomplex(prhs[1], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - } else - in1_ = NULL; - if (!in0_) { - mw_err_txt_ = "Argument plan cannot be null"; - goto mw_err_label; - } - out1_ = (dcomplex*) mxMalloc(dim2_*dim3_*sizeof(dcomplex)); - if (mexprofrecord_) - mexprofrecord_[17]++; - out0_ = finufft_execute(*in0_, in1_, out1_); +static const char *stubids17_ = + "o int = finufft_execute(i finufft_plan, i dcomplex[], o dcomplex[xx])"; + +void mexStub17(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + finufft_plan *in0_ = 0; /* plan */ + dcomplex *in1_ = 0; /* data_in */ + int out0_; /* ier */ + dcomplex *out1_ = 0; /* result */ + mwSize dim2_; /* nk */ + mwSize dim3_; /* n_trans */ + + dim2_ = (mwSize)mxWrapGetScalar(prhs[2], &mw_err_txt_); + dim3_ = (mwSize)mxWrapGetScalar(prhs[3], &mw_err_txt_); + + in0_ = (finufft_plan *)mxWrapGetP(prhs[0], "finufft_plan:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetM(prhs[1]) * mxGetN(prhs[1]) != 0) { + if (mxGetClassID(prhs[1]) != mxDOUBLE_CLASS) + mw_err_txt_ = "Invalid array argument, mxDOUBLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in1_ = mxWrapGetArray_dcomplex(prhs[1], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + } else + in1_ = NULL; + if (!in0_) { + mw_err_txt_ = "Argument plan cannot be null"; + goto mw_err_label; + } + out1_ = (dcomplex *)mxMalloc(dim2_ * dim3_ * sizeof(dcomplex)); + if (mexprofrecord_) mexprofrecord_[17]++; + out0_ = finufft_execute(*in0_, in1_, out1_); #if MX_HAS_INTERLEAVED_COMPLEX - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetDoubles(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetDoubles(plhs[0]) = out0_; #else - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetPr(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetPr(plhs[0]) = out0_; #endif - plhs[1] = mxCreateDoubleMatrix(dim2_, dim3_, mxCOMPLEX); - mxWrapCopy_dcomplex(plhs[1], out1_, dim2_*dim3_); + plhs[1] = mxCreateDoubleMatrix(dim2_, dim3_, mxCOMPLEX); + mxWrapCopy_dcomplex(plhs[1], out1_, dim2_ * dim3_); mw_err_label: - if (in1_) mxFree(in1_); - if (out1_) mxFree(out1_); - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (in1_) mxFree(in1_); + if (out1_) mxFree(out1_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 267 ---- - * int ier = finufftf_execute(finufftf_plan plan, fcomplex[] data_in, output fcomplex[nk, n_trans] result); + * int ier = finufftf_execute(finufftf_plan plan, fcomplex[] data_in, output fcomplex[nk, + * n_trans] result); */ -static const char* stubids18_ = "o int = finufftf_execute(i finufftf_plan, i fcomplex[], o fcomplex[xx])"; - -void mexStub18(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - finufftf_plan* in0_ =0; /* plan */ - fcomplex* in1_ =0; /* data_in */ - int out0_; /* ier */ - fcomplex* out1_=0; /* result */ - mwSize dim2_; /* nk */ - mwSize dim3_; 
/* n_trans */ - - dim2_ = (mwSize) mxWrapGetScalar(prhs[2], &mw_err_txt_); - dim3_ = (mwSize) mxWrapGetScalar(prhs[3], &mw_err_txt_); - - in0_ = (finufftf_plan*) mxWrapGetP(prhs[0], "finufftf_plan:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (mxGetM(prhs[1])*mxGetN(prhs[1]) != 0) { - if( mxGetClassID(prhs[1]) != mxSINGLE_CLASS ) - mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; - if (mw_err_txt_) goto mw_err_label; - in1_ = mxWrapGetArray_single_fcomplex(prhs[1], &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - } else - in1_ = NULL; - if (!in0_) { - mw_err_txt_ = "Argument plan cannot be null"; - goto mw_err_label; - } - out1_ = (fcomplex*) mxMalloc(dim2_*dim3_*sizeof(fcomplex)); - if (mexprofrecord_) - mexprofrecord_[18]++; - out0_ = finufftf_execute(*in0_, in1_, out1_); +static const char *stubids18_ = + "o int = finufftf_execute(i finufftf_plan, i fcomplex[], o fcomplex[xx])"; + +void mexStub18(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + finufftf_plan *in0_ = 0; /* plan */ + fcomplex *in1_ = 0; /* data_in */ + int out0_; /* ier */ + fcomplex *out1_ = 0; /* result */ + mwSize dim2_; /* nk */ + mwSize dim3_; /* n_trans */ + + dim2_ = (mwSize)mxWrapGetScalar(prhs[2], &mw_err_txt_); + dim3_ = (mwSize)mxWrapGetScalar(prhs[3], &mw_err_txt_); + + in0_ = (finufftf_plan *)mxWrapGetP(prhs[0], "finufftf_plan:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (mxGetM(prhs[1]) * mxGetN(prhs[1]) != 0) { + if (mxGetClassID(prhs[1]) != mxSINGLE_CLASS) + mw_err_txt_ = "Invalid array argument, mxSINGLE_CLASS expected"; + if (mw_err_txt_) goto mw_err_label; + in1_ = mxWrapGetArray_single_fcomplex(prhs[1], &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + } else + in1_ = NULL; + if (!in0_) { + mw_err_txt_ = "Argument plan cannot be null"; + goto mw_err_label; + } + out1_ = (fcomplex *)mxMalloc(dim2_ * dim3_ * sizeof(fcomplex)); + if (mexprofrecord_) mexprofrecord_[18]++; + out0_ = finufftf_execute(*in0_, in1_, out1_); #if MX_HAS_INTERLEAVED_COMPLEX - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetDoubles(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetDoubles(plhs[0]) = out0_; #else - plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); - *mxGetPr(plhs[0]) = out0_; + plhs[0] = mxCreateDoubleMatrix(1, 1, mxREAL); + *mxGetPr(plhs[0]) = out0_; #endif - plhs[1] = mxCreateNumericMatrix(dim2_, dim3_, mxSINGLE_CLASS, mxCOMPLEX); - mxWrapCopy_single_fcomplex(plhs[1], out1_, dim2_*dim3_); + plhs[1] = mxCreateNumericMatrix(dim2_, dim3_, mxSINGLE_CLASS, mxCOMPLEX); + mxWrapCopy_single_fcomplex(plhs[1], out1_, dim2_ * dim3_); mw_err_label: - if (in1_) mxFree(in1_); - if (out1_) mxFree(out1_); - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (in1_) mxFree(in1_); + if (out1_) mxFree(out1_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 279 ---- * finufft_destroy(finufft_plan plan); */ -static const char* stubids19_ = "finufft_destroy(i finufft_plan)"; - -void mexStub19(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - finufft_plan* in0_ =0; /* plan */ - - in0_ = (finufft_plan*) mxWrapGetP(prhs[0], "finufft_plan:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (!in0_) { - mw_err_txt_ = "Argument plan cannot be null"; - goto mw_err_label; - } - if (mexprofrecord_) - mexprofrecord_[19]++; - finufft_destroy(*in0_); +static const char *stubids19_ = "finufft_destroy(i finufft_plan)"; + +void 
mexStub19(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + finufft_plan *in0_ = 0; /* plan */ + + in0_ = (finufft_plan *)mxWrapGetP(prhs[0], "finufft_plan:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (!in0_) { + mw_err_txt_ = "Argument plan cannot be null"; + goto mw_err_label; + } + if (mexprofrecord_) mexprofrecord_[19]++; + finufft_destroy(*in0_); mw_err_label: - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- finufft.mw: 281 ---- * finufftf_destroy(finufftf_plan plan); */ -static const char* stubids20_ = "finufftf_destroy(i finufftf_plan)"; - -void mexStub20(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - const char* mw_err_txt_ = 0; - finufftf_plan* in0_ =0; /* plan */ - - in0_ = (finufftf_plan*) mxWrapGetP(prhs[0], "finufftf_plan:%p", &mw_err_txt_); - if (mw_err_txt_) - goto mw_err_label; - if (!in0_) { - mw_err_txt_ = "Argument plan cannot be null"; - goto mw_err_label; - } - if (mexprofrecord_) - mexprofrecord_[20]++; - finufftf_destroy(*in0_); +static const char *stubids20_ = "finufftf_destroy(i finufftf_plan)"; + +void mexStub20(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + const char *mw_err_txt_ = 0; + finufftf_plan *in0_ = 0; /* plan */ + + in0_ = (finufftf_plan *)mxWrapGetP(prhs[0], "finufftf_plan:%p", &mw_err_txt_); + if (mw_err_txt_) goto mw_err_label; + if (!in0_) { + mw_err_txt_ = "Argument plan cannot be null"; + goto mw_err_label; + } + if (mexprofrecord_) mexprofrecord_[20]++; + finufftf_destroy(*in0_); mw_err_label: - if (mw_err_txt_) - mexErrMsgTxt(mw_err_txt_); + if (mw_err_txt_) mexErrMsgTxt(mw_err_txt_); } /* ---- */ -void mexFunction(int nlhs, mxArray* plhs[], - int nrhs, const mxArray* prhs[]) -{ - char id[512]; - if (nrhs == 0) { - mexPrintf("Mex function installed\n"); - return; +void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { + char id[512]; + if (nrhs == 0) { + mexPrintf("Mex function installed\n"); + return; + } + + if (mxGetString(prhs[0], id, sizeof(id)) != 0) + mexErrMsgTxt("Identifier should be a string"); + else if (strcmp(id, stubids1_) == 0) + mexStub1(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids2_) == 0) + mexStub2(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids3_) == 0) + mexStub3(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids4_) == 0) + mexStub4(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids5_) == 0) + mexStub5(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids6_) == 0) + mexStub6(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids7_) == 0) + mexStub7(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids8_) == 0) + mexStub8(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids9_) == 0) + mexStub9(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids10_) == 0) + mexStub10(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids11_) == 0) + mexStub11(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids12_) == 0) + mexStub12(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids13_) == 0) + mexStub13(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids14_) == 0) + mexStub14(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids15_) == 0) + mexStub15(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids16_) == 0) + mexStub16(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids17_) == 0) + 
mexStub17(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids18_) == 0) + mexStub18(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids19_) == 0) + mexStub19(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, stubids20_) == 0) + mexStub20(nlhs, plhs, nrhs - 1, prhs + 1); + else if (strcmp(id, "*profile on*") == 0) { + if (!mexprofrecord_) { + mexprofrecord_ = (int *)malloc(21 * sizeof(int)); + mexLock(); } - - if (mxGetString(prhs[0], id, sizeof(id)) != 0) - mexErrMsgTxt("Identifier should be a string"); - else if (strcmp(id, stubids1_) == 0) - mexStub1(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids2_) == 0) - mexStub2(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids3_) == 0) - mexStub3(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids4_) == 0) - mexStub4(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids5_) == 0) - mexStub5(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids6_) == 0) - mexStub6(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids7_) == 0) - mexStub7(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids8_) == 0) - mexStub8(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids9_) == 0) - mexStub9(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids10_) == 0) - mexStub10(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids11_) == 0) - mexStub11(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids12_) == 0) - mexStub12(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids13_) == 0) - mexStub13(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids14_) == 0) - mexStub14(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids15_) == 0) - mexStub15(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids16_) == 0) - mexStub16(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids17_) == 0) - mexStub17(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids18_) == 0) - mexStub18(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids19_) == 0) - mexStub19(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, stubids20_) == 0) - mexStub20(nlhs,plhs, nrhs-1,prhs+1); - else if (strcmp(id, "*profile on*") == 0) { - if (!mexprofrecord_) { - mexprofrecord_ = (int*) malloc(21 * sizeof(int)); - mexLock(); - } - memset(mexprofrecord_, 0, 21 * sizeof(int)); - } else if (strcmp(id, "*profile off*") == 0) { - if (mexprofrecord_) { - free(mexprofrecord_); - mexUnlock(); - } - mexprofrecord_ = NULL; - } else if (strcmp(id, "*profile report*") == 0) { - if (!mexprofrecord_) - mexPrintf("Profiler inactive\n"); - mexPrintf("%d calls to finufft.mw:166\n", mexprofrecord_[1]); - mexPrintf("%d calls to finufft.mw:167\n", mexprofrecord_[2]); - mexPrintf("%d calls to finufft.mw:169\n", mexprofrecord_[3]); - mexPrintf("%d calls to finufft.mw:170\n", mexprofrecord_[4]); - mexPrintf("%d calls to finufft.mw:172\n", mexprofrecord_[5]); - mexPrintf("%d calls to finufft.mw:173\n", mexprofrecord_[6]); - mexPrintf("%d calls to finufft.mw:184\n", mexprofrecord_[7]); - mexPrintf("%d calls to finufft.mw:187\n", mexprofrecord_[8]); - mexPrintf("%d calls to finufft.mw:190\n", mexprofrecord_[9]); - mexPrintf("%d calls to finufft.mw:192\n", mexprofrecord_[10]); - mexPrintf("%d calls to finufft.mw:222\n", mexprofrecord_[11]); - mexPrintf("%d calls to finufft.mw:224\n", mexprofrecord_[12]); - mexPrintf("%d calls to finufft.mw:251\n", mexprofrecord_[13]); - mexPrintf("%d calls to finufft.mw:253\n", mexprofrecord_[14]); - mexPrintf("%d calls to finufft.mw:259\n", mexprofrecord_[15]); - mexPrintf("%d calls to finufft.mw:261\n", 
mexprofrecord_[16]); - mexPrintf("%d calls to finufft.mw:265\n", mexprofrecord_[17]); - mexPrintf("%d calls to finufft.mw:267\n", mexprofrecord_[18]); - mexPrintf("%d calls to finufft.mw:279\n", mexprofrecord_[19]); - mexPrintf("%d calls to finufft.mw:281\n", mexprofrecord_[20]); - } else if (strcmp(id, "*profile log*") == 0) { - FILE* logfp; - if (nrhs != 2 || mxGetString(prhs[1], id, sizeof(id)) != 0) - mexErrMsgTxt("Must have two string arguments"); - logfp = fopen(id, "w+"); - if (!logfp) - mexErrMsgTxt("Cannot open log for output"); - if (!mexprofrecord_) - fprintf(logfp, "Profiler inactive\n"); - fprintf(logfp, "%d calls to finufft.mw:166\n", mexprofrecord_[1]); - fprintf(logfp, "%d calls to finufft.mw:167\n", mexprofrecord_[2]); - fprintf(logfp, "%d calls to finufft.mw:169\n", mexprofrecord_[3]); - fprintf(logfp, "%d calls to finufft.mw:170\n", mexprofrecord_[4]); - fprintf(logfp, "%d calls to finufft.mw:172\n", mexprofrecord_[5]); - fprintf(logfp, "%d calls to finufft.mw:173\n", mexprofrecord_[6]); - fprintf(logfp, "%d calls to finufft.mw:184\n", mexprofrecord_[7]); - fprintf(logfp, "%d calls to finufft.mw:187\n", mexprofrecord_[8]); - fprintf(logfp, "%d calls to finufft.mw:190\n", mexprofrecord_[9]); - fprintf(logfp, "%d calls to finufft.mw:192\n", mexprofrecord_[10]); - fprintf(logfp, "%d calls to finufft.mw:222\n", mexprofrecord_[11]); - fprintf(logfp, "%d calls to finufft.mw:224\n", mexprofrecord_[12]); - fprintf(logfp, "%d calls to finufft.mw:251\n", mexprofrecord_[13]); - fprintf(logfp, "%d calls to finufft.mw:253\n", mexprofrecord_[14]); - fprintf(logfp, "%d calls to finufft.mw:259\n", mexprofrecord_[15]); - fprintf(logfp, "%d calls to finufft.mw:261\n", mexprofrecord_[16]); - fprintf(logfp, "%d calls to finufft.mw:265\n", mexprofrecord_[17]); - fprintf(logfp, "%d calls to finufft.mw:267\n", mexprofrecord_[18]); - fprintf(logfp, "%d calls to finufft.mw:279\n", mexprofrecord_[19]); - fprintf(logfp, "%d calls to finufft.mw:281\n", mexprofrecord_[20]); - fclose(logfp); - } else - mexErrMsgTxt("Unknown identifier"); + memset(mexprofrecord_, 0, 21 * sizeof(int)); + } else if (strcmp(id, "*profile off*") == 0) { + if (mexprofrecord_) { + free(mexprofrecord_); + mexUnlock(); + } + mexprofrecord_ = NULL; + } else if (strcmp(id, "*profile report*") == 0) { + if (!mexprofrecord_) mexPrintf("Profiler inactive\n"); + mexPrintf("%d calls to finufft.mw:166\n", mexprofrecord_[1]); + mexPrintf("%d calls to finufft.mw:167\n", mexprofrecord_[2]); + mexPrintf("%d calls to finufft.mw:169\n", mexprofrecord_[3]); + mexPrintf("%d calls to finufft.mw:170\n", mexprofrecord_[4]); + mexPrintf("%d calls to finufft.mw:172\n", mexprofrecord_[5]); + mexPrintf("%d calls to finufft.mw:173\n", mexprofrecord_[6]); + mexPrintf("%d calls to finufft.mw:184\n", mexprofrecord_[7]); + mexPrintf("%d calls to finufft.mw:187\n", mexprofrecord_[8]); + mexPrintf("%d calls to finufft.mw:190\n", mexprofrecord_[9]); + mexPrintf("%d calls to finufft.mw:192\n", mexprofrecord_[10]); + mexPrintf("%d calls to finufft.mw:222\n", mexprofrecord_[11]); + mexPrintf("%d calls to finufft.mw:224\n", mexprofrecord_[12]); + mexPrintf("%d calls to finufft.mw:251\n", mexprofrecord_[13]); + mexPrintf("%d calls to finufft.mw:253\n", mexprofrecord_[14]); + mexPrintf("%d calls to finufft.mw:259\n", mexprofrecord_[15]); + mexPrintf("%d calls to finufft.mw:261\n", mexprofrecord_[16]); + mexPrintf("%d calls to finufft.mw:265\n", mexprofrecord_[17]); + mexPrintf("%d calls to finufft.mw:267\n", mexprofrecord_[18]); + mexPrintf("%d calls to 
finufft.mw:279\n", mexprofrecord_[19]); + mexPrintf("%d calls to finufft.mw:281\n", mexprofrecord_[20]); + } else if (strcmp(id, "*profile log*") == 0) { + FILE *logfp; + if (nrhs != 2 || mxGetString(prhs[1], id, sizeof(id)) != 0) + mexErrMsgTxt("Must have two string arguments"); + logfp = fopen(id, "w+"); + if (!logfp) mexErrMsgTxt("Cannot open log for output"); + if (!mexprofrecord_) fprintf(logfp, "Profiler inactive\n"); + fprintf(logfp, "%d calls to finufft.mw:166\n", mexprofrecord_[1]); + fprintf(logfp, "%d calls to finufft.mw:167\n", mexprofrecord_[2]); + fprintf(logfp, "%d calls to finufft.mw:169\n", mexprofrecord_[3]); + fprintf(logfp, "%d calls to finufft.mw:170\n", mexprofrecord_[4]); + fprintf(logfp, "%d calls to finufft.mw:172\n", mexprofrecord_[5]); + fprintf(logfp, "%d calls to finufft.mw:173\n", mexprofrecord_[6]); + fprintf(logfp, "%d calls to finufft.mw:184\n", mexprofrecord_[7]); + fprintf(logfp, "%d calls to finufft.mw:187\n", mexprofrecord_[8]); + fprintf(logfp, "%d calls to finufft.mw:190\n", mexprofrecord_[9]); + fprintf(logfp, "%d calls to finufft.mw:192\n", mexprofrecord_[10]); + fprintf(logfp, "%d calls to finufft.mw:222\n", mexprofrecord_[11]); + fprintf(logfp, "%d calls to finufft.mw:224\n", mexprofrecord_[12]); + fprintf(logfp, "%d calls to finufft.mw:251\n", mexprofrecord_[13]); + fprintf(logfp, "%d calls to finufft.mw:253\n", mexprofrecord_[14]); + fprintf(logfp, "%d calls to finufft.mw:259\n", mexprofrecord_[15]); + fprintf(logfp, "%d calls to finufft.mw:261\n", mexprofrecord_[16]); + fprintf(logfp, "%d calls to finufft.mw:265\n", mexprofrecord_[17]); + fprintf(logfp, "%d calls to finufft.mw:267\n", mexprofrecord_[18]); + fprintf(logfp, "%d calls to finufft.mw:279\n", mexprofrecord_[19]); + fprintf(logfp, "%d calls to finufft.mw:281\n", mexprofrecord_[20]); + fclose(logfp); + } else + mexErrMsgTxt("Unknown identifier"); } - diff --git a/perftest/big2d2f.cpp b/perftest/big2d2f.cpp index 4b59a72df..1a87067d2 100644 --- a/perftest/big2d2f.cpp +++ b/perftest/big2d2f.cpp @@ -10,31 +10,29 @@ #include // also used in this example... 
-#include #include #include #include +#include using namespace std; -int test_finufft(finufft_opts* opts) -{ - size_t nj = 129*129*2; - size_t ms = 129, mt = 129; - size_t ntrans = 75000; // the point is: 129*129*2*75000 > 2^31 ~ 2.15e9 - std::vector x(nj); // bunch of zero data - std::vector y(nj); - std::vector> cj(ntrans*nj); - std::vector> fk(ntrans*ms*mt); +int test_finufft(finufft_opts *opts) { + size_t nj = 129 * 129 * 2; + size_t ms = 129, mt = 129; + size_t ntrans = 75000; // the point is: 129*129*2*75000 > 2^31 ~ 2.15e9 + std::vector x(nj); // bunch of zero data + std::vector y(nj); + std::vector> cj(ntrans * nj); + std::vector> fk(ntrans * ms * mt); - int ier = finufftf2d2many(ntrans, nj, x.data(), y.data(), cj.data(), - -1, 1e-3, ms, mt, fk.data(), opts); + int ier = finufftf2d2many(ntrans, nj, x.data(), y.data(), cj.data(), -1, 1e-3, ms, mt, + fk.data(), opts); - std::cout << "\tbig2d2f finufft status: " << ier << std::endl; - return ier; + std::cout << "\tbig2d2f finufft status: " << ier << std::endl; + return ier; } -int main(int argc, char* argv[]) -{ +int main(int argc, char *argv[]) { finufft_opts opts; finufftf_default_opts(&opts); return test_finufft(&opts); diff --git a/perftest/cuda/cuperftest.cu b/perftest/cuda/cuperftest.cu index 5b51fe3ac..f72ffb3e6 100644 --- a/perftest/cuda/cuperftest.cu +++ b/perftest/cuda/cuperftest.cu @@ -14,34 +14,34 @@ #include #include -std::string get_or(const std::unordered_map &m, const std::string &key, - const std::string &default_value) { - auto it = m.find(key); - if (it == m.end()) { - return default_value; - } - return it->second; +std::string get_or(const std::unordered_map &m, + const std::string &key, const std::string &default_value) { + auto it = m.find(key); + if (it == m.end()) { + return default_value; + } + return it->second; } struct test_options_t { - char prec; - int type; - int n_runs; - int N[3]; - int M; - int ntransf; - int kerevalmethod; - int method; - int sort; - double tol; - - test_options_t(int argc, char *argv[]) { - std::unordered_map options_map; - - while (true) { - int option_index = 0; - - // clang-format off + char prec; + int type; + int n_runs; + int N[3]; + int M; + int ntransf; + int kerevalmethod; + int method; + int sort; + double tol; + + test_options_t(int argc, char *argv[]) { + std::unordered_map options_map; + + while (true) { + int option_index = 0; + + // clang-format off static struct option long_options[] { {"prec", required_argument, 0, 0}, {"type", required_argument, 0, 0}, @@ -57,251 +57,248 @@ struct test_options_t { {"sort", required_argument, 0, 0}, {0, 0, 0, 0}, }; - // clang-format on - - int c = getopt_long(argc, argv, "", long_options, &option_index); - if (c == -1) - break; - - switch (c) { - case 0: - options_map[long_options[option_index].name] = optarg; - break; - - default: - break; - } - } - - prec = get_or(options_map, "prec", "f")[0]; - type = std::stoi(get_or(options_map, "type", "1")); - n_runs = std::stoi(get_or(options_map, "n_runs", "10")); - N[0] = std::stof(get_or(options_map, "N1", "1E6")); - N[1] = std::stof(get_or(options_map, "N2", "1")); - N[2] = std::stof(get_or(options_map, "N3", "1")); - M = std::stof(get_or(options_map, "M", "2E6")); - ntransf = std::stoi(get_or(options_map, "ntransf", "1")); - method = std::stoi(get_or(options_map, "method", "1")); - kerevalmethod = std::stoi(get_or(options_map, "kerevalmethod", "1")); - sort = std::stoi(get_or(options_map, "sort", "1")); - tol = std::stof(get_or(options_map, "tol", "1E-5")); - } + // clang-format on + + 
int c = getopt_long(argc, argv, "", long_options, &option_index); + if (c == -1) break; + + switch (c) { + case 0: + options_map[long_options[option_index].name] = optarg; + break; - friend std::ostream &operator<<(std::ostream &outs, const test_options_t &opts) { - return outs << "# prec = " << opts.prec << "\n" - << "# type = " << opts.type << "\n" - << "# n_runs = " << opts.n_runs << "\n" - << "# N1 = " << opts.N[0] << "\n" - << "# N2 = " << opts.N[1] << "\n" - << "# N3 = " << opts.N[2] << "\n" - << "# M = " << opts.M << "\n" - << "# ntransf = " << opts.ntransf << "\n" - << "# method = " << opts.method << "\n" - << "# kerevalmethod = " << opts.kerevalmethod << "\n" - << "# sort = " << opts.sort << "\n" - << "# tol = " << opts.tol << "\n"; + default: + break; + } } + + prec = get_or(options_map, "prec", "f")[0]; + type = std::stoi(get_or(options_map, "type", "1")); + n_runs = std::stoi(get_or(options_map, "n_runs", "10")); + N[0] = std::stof(get_or(options_map, "N1", "1E6")); + N[1] = std::stof(get_or(options_map, "N2", "1")); + N[2] = std::stof(get_or(options_map, "N3", "1")); + M = std::stof(get_or(options_map, "M", "2E6")); + ntransf = std::stoi(get_or(options_map, "ntransf", "1")); + method = std::stoi(get_or(options_map, "method", "1")); + kerevalmethod = std::stoi(get_or(options_map, "kerevalmethod", "1")); + sort = std::stoi(get_or(options_map, "sort", "1")); + tol = std::stof(get_or(options_map, "tol", "1E-5")); + } + + friend std::ostream &operator<<(std::ostream &outs, const test_options_t &opts) { + return outs << "# prec = " << opts.prec << "\n" + << "# type = " << opts.type << "\n" + << "# n_runs = " << opts.n_runs << "\n" + << "# N1 = " << opts.N[0] << "\n" + << "# N2 = " << opts.N[1] << "\n" + << "# N3 = " << opts.N[2] << "\n" + << "# M = " << opts.M << "\n" + << "# ntransf = " << opts.ntransf << "\n" + << "# method = " << opts.method << "\n" + << "# kerevalmethod = " << opts.kerevalmethod << "\n" + << "# sort = " << opts.sort << "\n" + << "# tol = " << opts.tol << "\n"; + } }; struct CudaTimer { - CudaTimer() {} + CudaTimer() {} - ~CudaTimer() { - for (auto &event : start_) - cudaEventDestroy(event); - for (auto &event : stop_) - cudaEventDestroy(event); - } + ~CudaTimer() { + for (auto &event : start_) cudaEventDestroy(event); + for (auto &event : stop_) cudaEventDestroy(event); + } - void start() { - start_.push_back(cudaEvent_t{}); - stop_.push_back(cudaEvent_t{}); + void start() { + start_.push_back(cudaEvent_t{}); + stop_.push_back(cudaEvent_t{}); - cudaEventCreate(&start_.back()); - cudaEventCreate(&stop_.back()); + cudaEventCreate(&start_.back()); + cudaEventCreate(&stop_.back()); - cudaEventRecord(start_.back()); - } + cudaEventRecord(start_.back()); + } - void stop() { cudaEventRecord(stop_.back()); } + void stop() { cudaEventRecord(stop_.back()); } - void sync() { - for (auto &event : stop_) - cudaEventSynchronize(event); - } + void sync() { + for (auto &event : stop_) cudaEventSynchronize(event); + } - float mean() { return this->tot() / start_.size(); } + float mean() { return this->tot() / start_.size(); } - float std() { - float avg = this->mean(); + float std() { + float avg = this->mean(); - double var = 0.0; - for (int i = 0; i < start_.size(); ++i) { - float dt; - cudaEventElapsedTime(&dt, start_[i], stop_[i]); - var += (dt - avg) * (dt - avg); - } - var /= start_.size(); - - return sqrt(var); + double var = 0.0; + for (int i = 0; i < start_.size(); ++i) { + float dt; + cudaEventElapsedTime(&dt, start_[i], stop_[i]); + var += (dt - avg) * (dt - avg); } + 
var /= start_.size(); - float tot() { - float dt_tot = 0.; - for (int i = 0; i < start_.size(); ++i) { - float dt; - cudaEventElapsedTime(&dt, start_[i], stop_[i]); - dt_tot += dt; - } + return sqrt(var); + } - return dt_tot; + float tot() { + float dt_tot = 0.; + for (int i = 0; i < start_.size(); ++i) { + float dt; + cudaEventElapsedTime(&dt, start_[i], stop_[i]); + dt_tot += dt; } - int count() { return start_.size(); } + return dt_tot; + } + + int count() { return start_.size(); } - std::vector start_; - std::vector stop_; + std::vector start_; + std::vector stop_; }; -template -inline void timeit(F f, CudaTimer &timer, Args... args) { - timer.start(); - f(args...); - timer.stop(); +template inline void timeit(F f, CudaTimer &timer, Args... args) { + timer.start(); + f(args...); + timer.stop(); } void gpu_warmup() { - int nf1 = 100; - cufftHandle fftplan; - cufftPlan1d(&fftplan, nf1, CUFFT_Z2Z, 1); - thrust::device_vector in(nf1), out(nf1); - cufftExecZ2Z(fftplan, in.data().get(), out.data().get(), 1); - cudaDeviceSynchronize(); + int nf1 = 100; + cufftHandle fftplan; + cufftPlan1d(&fftplan, nf1, CUFFT_Z2Z, 1); + thrust::device_vector in(nf1), out(nf1); + cufftExecZ2Z(fftplan, in.data().get(), out.data().get(), 1); + cudaDeviceSynchronize(); } -template -void run_test(test_options_t &test_opts) { - std::cout << test_opts; - const int ntransf = test_opts.ntransf; - const int64_t M = test_opts.M; - const int N = test_opts.N[0] * test_opts.N[1] * test_opts.N[2]; - const int type = test_opts.type; - constexpr int iflag = 1; - - thrust::host_vector x(M * ntransf), y(M * ntransf), z(M * ntransf); - thrust::host_vector> c(M * ntransf), fk(N * ntransf); - - thrust::device_vector d_x(M * ntransf), d_y(M * ntransf), d_z(M * ntransf); - thrust::device_vector> d_c(M * ntransf), d_fk(N * ntransf); - - std::default_random_engine eng(1); - std::uniform_real_distribution dist11(-1, 1); - auto randm11 = [&eng, &dist11]() { return dist11(eng); }; - - // Making data - for (int64_t i = 0; i < M; i++) { - x[i] = M_PI * randm11(); // x in [-pi,pi) - y[i] = M_PI * randm11(); - z[i] = M_PI * randm11(); - } - for (int64_t i = M; i < M * ntransf; ++i) { - int64_t j = i % M; - x[i] = x[j]; - y[i] = y[j]; - z[i] = z[j]; +template void run_test(test_options_t &test_opts) { + std::cout << test_opts; + const int ntransf = test_opts.ntransf; + const int64_t M = test_opts.M; + const int N = test_opts.N[0] * test_opts.N[1] * test_opts.N[2]; + const int type = test_opts.type; + constexpr int iflag = 1; + + thrust::host_vector x(M * ntransf), y(M * ntransf), z(M * ntransf); + thrust::host_vector> c(M * ntransf), fk(N * ntransf); + + thrust::device_vector d_x(M * ntransf), d_y(M * ntransf), d_z(M * ntransf); + thrust::device_vector> d_c(M * ntransf), d_fk(N * ntransf); + + std::default_random_engine eng(1); + std::uniform_real_distribution dist11(-1, 1); + auto randm11 = [&eng, &dist11]() { + return dist11(eng); + }; + + // Making data + for (int64_t i = 0; i < M; i++) { + x[i] = M_PI * randm11(); // x in [-pi,pi) + y[i] = M_PI * randm11(); + z[i] = M_PI * randm11(); + } + for (int64_t i = M; i < M * ntransf; ++i) { + int64_t j = i % M; + x[i] = x[j]; + y[i] = y[j]; + z[i] = z[j]; + } + + if (type == 1) { + for (int i = 0; i < M * ntransf; i++) { + c[i].real(randm11()); + c[i].imag(randm11()); } - if (type == 1) { - for (int i = 0; i < M * ntransf; i++) { - c[i].real(randm11()); - c[i].imag(randm11()); - } - - } else if (type == 2) { - for (int i = 0; i < N * ntransf; i++) { - fk[i].real(randm11()); - 
fk[i].imag(randm11()); - } - } else { - std::cerr << "Invalid type " << type << " supplied\n"; - return; + } else if (type == 2) { + for (int i = 0; i < N * ntransf; i++) { + fk[i].real(randm11()); + fk[i].imag(randm11()); } - - gpu_warmup(); - - cufinufft_opts opts; - int dim = 0; - for (int i = 0; i < 3; ++i) - dim = test_opts.N[i] > 1 ? i + 1 : dim; - - cufinufft_default_opts(&opts); - opts.gpu_method = test_opts.method; - opts.gpu_sort = test_opts.sort; - opts.gpu_kerevalmeth = test_opts.kerevalmethod; - - cufinufft_plan_t *dplan; - CudaTimer h2d_timer, makeplan_timer, setpts_timer, execute_timer, d2h_timer, amortized_timer; - { - amortized_timer.start(); - h2d_timer.start(); - d_x = x, d_y = y, d_z = z; - if (type == 1) - d_c = c; - if (type == 2) - d_fk = fk; - h2d_timer.stop(); - - T *d_x_p = dim >= 1 ? d_x.data().get() : nullptr; - T *d_y_p = dim >= 2 ? d_y.data().get() : nullptr; - T *d_z_p = dim == 3 ? d_z.data().get() : nullptr; - cuda_complex *d_c_p = (cuda_complex *)d_c.data().get(); - cuda_complex *d_fk_p = (cuda_complex *)d_fk.data().get(); - - timeit(cufinufft_makeplan_impl, makeplan_timer, test_opts.type, dim, test_opts.N, iflag, ntransf, - test_opts.tol, &dplan, &opts); - for (int i = 0; i < test_opts.n_runs; ++i) { - timeit(cufinufft_setpts_impl, setpts_timer, M, d_x_p, d_y_p, d_z_p, 0, nullptr, nullptr, nullptr, dplan); - timeit(cufinufft_execute_impl, execute_timer, d_c_p, d_fk_p, dplan); - } - - d2h_timer.start(); - if (type == 1) - fk = d_fk; - if (type == 2) - c = d_c; - d2h_timer.stop(); - - amortized_timer.stop(); - - h2d_timer.sync(); - makeplan_timer.sync(); - setpts_timer.sync(); - execute_timer.sync(); - d2h_timer.sync(); - amortized_timer.sync(); + } else { + std::cerr << "Invalid type " << type << " supplied\n"; + return; + } + + gpu_warmup(); + + cufinufft_opts opts; + int dim = 0; + for (int i = 0; i < 3; ++i) dim = test_opts.N[i] > 1 ? i + 1 : dim; + + cufinufft_default_opts(&opts); + opts.gpu_method = test_opts.method; + opts.gpu_sort = test_opts.sort; + opts.gpu_kerevalmeth = test_opts.kerevalmethod; + + cufinufft_plan_t *dplan; + CudaTimer h2d_timer, makeplan_timer, setpts_timer, execute_timer, d2h_timer, + amortized_timer; + { + amortized_timer.start(); + h2d_timer.start(); + d_x = x, d_y = y, d_z = z; + if (type == 1) d_c = c; + if (type == 2) d_fk = fk; + h2d_timer.stop(); + + T *d_x_p = dim >= 1 ? d_x.data().get() : nullptr; + T *d_y_p = dim >= 2 ? d_y.data().get() : nullptr; + T *d_z_p = dim == 3 ? 
d_z.data().get() : nullptr; + cuda_complex *d_c_p = (cuda_complex *)d_c.data().get(); + cuda_complex *d_fk_p = (cuda_complex *)d_fk.data().get(); + + timeit(cufinufft_makeplan_impl, makeplan_timer, test_opts.type, dim, test_opts.N, + iflag, ntransf, test_opts.tol, &dplan, &opts); + for (int i = 0; i < test_opts.n_runs; ++i) { + timeit(cufinufft_setpts_impl, setpts_timer, M, d_x_p, d_y_p, d_z_p, 0, nullptr, + nullptr, nullptr, dplan); + timeit(cufinufft_execute_impl, execute_timer, d_c_p, d_fk_p, dplan); } - const int64_t nupts_tot = M * test_opts.n_runs * ntransf; - - printf("event,count,tot(ms),mean(ms),std(ms),nupts/s,ns/nupt\n"); - printf("host_to_device,%d,%f,%f,%f,0.0,0.0\n", h2d_timer.count(), h2d_timer.tot(), - h2d_timer.mean(), h2d_timer.std()); - printf("makeplan,%d,%f,%f,%f,0.0,0.0\n", makeplan_timer.count(), makeplan_timer.tot(), makeplan_timer.mean(), - makeplan_timer.std()); - printf("setpts,%d,%f,%f,%f,%g,%f\n", test_opts.n_runs, setpts_timer.tot(), setpts_timer.mean(), setpts_timer.std(), - nupts_tot * 1000 / setpts_timer.tot(), setpts_timer.tot() * 1E6 / nupts_tot); - printf("execute,%d,%f,%f,%f,%g,%f\n", test_opts.n_runs, execute_timer.tot(), execute_timer.mean(), - execute_timer.std(), nupts_tot * 1000 / execute_timer.tot(), execute_timer.tot() * 1E6 / nupts_tot); - printf("device_to_host,%d,%f,%f,%f,0.0,0.0\n", d2h_timer.count(), d2h_timer.tot(), - d2h_timer.mean(), d2h_timer.std()); - printf("amortized,%d,%f,%f,%f,%g,%f\n", 1, amortized_timer.tot(), amortized_timer.mean(), amortized_timer.std(), - nupts_tot * 1000 / amortized_timer.tot(), amortized_timer.tot() * 1E6 / nupts_tot); + d2h_timer.start(); + if (type == 1) fk = d_fk; + if (type == 2) c = d_c; + d2h_timer.stop(); + + amortized_timer.stop(); + + h2d_timer.sync(); + makeplan_timer.sync(); + setpts_timer.sync(); + execute_timer.sync(); + d2h_timer.sync(); + amortized_timer.sync(); + } + + const int64_t nupts_tot = M * test_opts.n_runs * ntransf; + + printf("event,count,tot(ms),mean(ms),std(ms),nupts/s,ns/nupt\n"); + printf("host_to_device,%d,%f,%f,%f,0.0,0.0\n", h2d_timer.count(), h2d_timer.tot(), + h2d_timer.mean(), h2d_timer.std()); + printf("makeplan,%d,%f,%f,%f,0.0,0.0\n", makeplan_timer.count(), makeplan_timer.tot(), + makeplan_timer.mean(), makeplan_timer.std()); + printf("setpts,%d,%f,%f,%f,%g,%f\n", test_opts.n_runs, setpts_timer.tot(), + setpts_timer.mean(), setpts_timer.std(), nupts_tot * 1000 / setpts_timer.tot(), + setpts_timer.tot() * 1E6 / nupts_tot); + printf("execute,%d,%f,%f,%f,%g,%f\n", test_opts.n_runs, execute_timer.tot(), + execute_timer.mean(), execute_timer.std(), + nupts_tot * 1000 / execute_timer.tot(), execute_timer.tot() * 1E6 / nupts_tot); + printf("device_to_host,%d,%f,%f,%f,0.0,0.0\n", d2h_timer.count(), d2h_timer.tot(), + d2h_timer.mean(), d2h_timer.std()); + printf("amortized,%d,%f,%f,%f,%g,%f\n", 1, amortized_timer.tot(), + amortized_timer.mean(), amortized_timer.std(), + nupts_tot * 1000 / amortized_timer.tot(), + amortized_timer.tot() * 1E6 / nupts_tot); } int main(int argc, char *argv[]) { - if (argc == 2 && (std::string(argv[1]) == "--help" || std::string(argv[1]) == "-h")) { - test_options_t default_opts(0, nullptr); - // clang-format off + if (argc == 2 && (std::string(argv[1]) == "--help" || std::string(argv[1]) == "-h")) { + test_options_t default_opts(0, nullptr); + // clang-format off std::cout << "Valid options:\n" " --prec \n" " float or double precision. i.e. 
'f' or 'd'\n" @@ -347,15 +344,15 @@ int main(int argc, char *argv[]) { " 0: do not sort the points\n" " 1: sort the points\n" " default: " << default_opts.sort << "\n"; - // clang-format on - return 0; - } - test_options_t opts(argc, argv); + // clang-format on + return 0; + } + test_options_t opts(argc, argv); - if (opts.prec == 'f') - run_test(opts); - else if (opts.prec == 'd') - run_test(opts); + if (opts.prec == 'f') + run_test(opts); + else if (opts.prec == 'd') + run_test(opts); - return 0; + return 0; } diff --git a/perftest/guru_timing_test.cpp b/perftest/guru_timing_test.cpp index 145d4f1ef..90055a36b 100644 --- a/perftest/guru_timing_test.cpp +++ b/perftest/guru_timing_test.cpp @@ -1,11 +1,8 @@ #include // for sleep call #if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__) -#include -void sleep(unsigned long seconds) -{ - Sleep(seconds * 1000); -} +#include +void sleep(unsigned long seconds) { Sleep(seconds * 1000); } #else #include #endif @@ -14,11 +11,10 @@ using namespace finufft; using namespace finufft::utils; // forward declaration of helper to (repeatedly if needed) call finufft?d? -double many_simple_calls(CPX *c,CPX *F,FLT*x, FLT*y, FLT*z,FINUFFT_PLAN plan); - +double many_simple_calls(CPX *c, CPX *F, FLT *x, FLT *y, FLT *z, FINUFFT_PLAN plan); // -------------------------------------------------------------------------- -int main(int argc, char* argv[]) +int main(int argc, char *argv[]) /* Timing-only tester for the guru interface, allowing control of many params and opts from the command line. It compares doing many transforms with same NU pts, with repeated calls to @@ -37,10 +33,10 @@ int main(int argc, char* argv[]) debug = 0: rel errors and overall timing 1: timing breakdowns 2: also spreading output - + spread_scheme = 0: sequential maximally multithreaded spread/interp 1: parallel singlethreaded spread/interp, nested last batch - + Example: guru_timing_test 100 1 2 100 100 0 1000000 1e-3 1 0 0 2 2.0 The unused dimensions of Nmodes may be left as zero. @@ -51,147 +47,159 @@ int main(int argc, char* argv[]) added 2 extra args, 5/22/20. Moved to perftests 7/23/20. */ { - double tsleep = 0.1; // how long wait between tests to let FFTW settle (1.0?) + double tsleep = 0.1; // how long wait between tests to let FFTW settle (1.0?) 
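  // For orientation, the "guru" calls timed in this file follow FINUFFT's standard
  // four-step sequence. A minimal double-precision 1D type-1 sketch of that flow,
  // assuming the public finufft.h guru API (illustrative only; error checks omitted):
  //
  //   #include <finufft.h>
  //   #include <complex>
  //   #include <vector>
  //
  //   int64_t M = 1000000, nmodes[3] = {1000, 1, 1};
  //   std::vector<double> x(M);                              // NU pts, each in [-pi,pi)
  //   std::vector<std::complex<double>> c(M), F(nmodes[0]);  // strengths, modes
  //   finufft_plan plan;
  //   finufft_makeplan(1, 1, nmodes, +1, 1, 1e-9, &plan, NULL);           // 1: plan
  //   finufft_setpts(plan, M, x.data(), NULL, NULL, 0, NULL, NULL, NULL); // 2: set pts
  //   finufft_execute(plan, c.data(), F.data());                          // 3: c -> F
  //   finufft_destroy(plan);                                              // 4: free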
int ntransf, type, ndim; - BIGINT M, N1, N2, N3; // M = # srcs, N1,N2,N3= # modes in each dim + BIGINT M, N1, N2, N3; // M = # srcs, N1,N2,N3= # modes in each dim double w, tol = 1e-6; - int isign = +1; // choose which exponential sign to test + int isign = +1; // choose which exponential sign to test finufft_opts opts; - FINUFFT_DEFAULT_OPTS(&opts); // for guru interface - + FINUFFT_DEFAULT_OPTS(&opts); // for guru interface + // Collect command line arguments ------------------------------------------ - if (argc<8 || argc>14) { - fprintf(stderr,"Usage: guru_timing_test ntransf type ndim N1 N2 N3 Nsrc [tol [debug [spread_thread [maxbatchsize [spread_sort [upsampfac]]]]]]\n\teg:\tguru_timing_test 100 1 2 1e2 1e2 0 1e6 1e-3 1 0 0 2\n"); + if (argc < 8 || argc > 14) { + fprintf( + stderr, + "Usage: guru_timing_test ntransf type ndim N1 N2 N3 Nsrc [tol [debug " + "[spread_thread [maxbatchsize [spread_sort " + "[upsampfac]]]]]]\n\teg:\tguru_timing_test 100 1 2 1e2 1e2 0 1e6 1e-3 1 0 0 2\n"); return 1; } - sscanf(argv[1],"%d",&ntransf); - sscanf(argv[2],"%d",&type); - sscanf(argv[3],"%d",&ndim); - sscanf(argv[4],"%lf",&w); N1 = (BIGINT)w; - sscanf(argv[5],"%lf",&w); N2 = (BIGINT)w; - sscanf(argv[6],"%lf",&w); N3 = (BIGINT)w; - sscanf(argv[7],"%lf",&w); M = (BIGINT)w; - if (argc>8) sscanf(argv[8],"%lf",&tol); - if (argc>9) sscanf(argv[9],"%d",&opts.debug); - opts.spread_debug = (opts.debug>1) ? 1 : 0; // see output from spreader - if (argc>10) sscanf(argv[10], "%d", &opts.spread_thread); - if (argc>11) sscanf(argv[11], "%d", &opts.maxbatchsize); - if (argc>12) sscanf(argv[12],"%d",&opts.spread_sort); - if (argc>13) { sscanf(argv[13],"%lf",&w); opts.upsampfac = (FLT)w; } - - // Allocate and initialize input ------------------------------------------- + sscanf(argv[1], "%d", &ntransf); + sscanf(argv[2], "%d", &type); + sscanf(argv[3], "%d", &ndim); + sscanf(argv[4], "%lf", &w); + N1 = (BIGINT)w; + sscanf(argv[5], "%lf", &w); + N2 = (BIGINT)w; + sscanf(argv[6], "%lf", &w); + N3 = (BIGINT)w; + sscanf(argv[7], "%lf", &w); + M = (BIGINT)w; + if (argc > 8) sscanf(argv[8], "%lf", &tol); + if (argc > 9) sscanf(argv[9], "%d", &opts.debug); + opts.spread_debug = (opts.debug > 1) ? 1 : 0; // see output from spreader + if (argc > 10) sscanf(argv[10], "%d", &opts.spread_thread); + if (argc > 11) sscanf(argv[11], "%d", &opts.maxbatchsize); + if (argc > 12) sscanf(argv[12], "%d", &opts.spread_sort); + if (argc > 13) { + sscanf(argv[13], "%lf", &w); + opts.upsampfac = (FLT)w; + } + + // Allocate and initialize input ------------------------------------------- cout << scientific << setprecision(15); - N2 = (N2 == 0) ? 1 : N2; - N3 = (N3 == 0) ? 1 : N3; - BIGINT N = N1*N2*N3; - - FLT* s = NULL; - FLT* t = NULL; - FLT* u = NULL; - if (type == 3) { // make target freq NU pts for type 3 (N of them)... - s = (FLT*)malloc(sizeof(FLT)*N); // targ freqs (1-cmpt) - FLT S1 = (FLT)N1/2; + N2 = (N2 == 0) ? 1 : N2; + N3 = (N3 == 0) ? 1 : N3; + BIGINT N = N1 * N2 * N3; + + FLT *s = NULL; + FLT *t = NULL; + FLT *u = NULL; + if (type == 3) { // make target freq NU pts for type 3 (N of them)... 
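    // (Reminder: for type 3, s,t,u below are the nonuniform target frequencies; the
    // transform evaluated is, roughly, f[k] = sum_j c[j] exp(i*isign*(s[k]*x[j] +
    // t[k]*y[j] + u[k]*z[j])), which is why the targets are drawn at a scale set by
    // half the requested mode numbers, e.g. S1 = N1/2.)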
+ s = (FLT *)malloc(sizeof(FLT) * N); // targ freqs (1-cmpt) + FLT S1 = (FLT)N1 / 2; #pragma omp parallel { - unsigned int se=MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s -#pragma omp for schedule(dynamic,TEST_RANDCHUNK) - for (BIGINT k=0; k 1) { - t = (FLT*)malloc(sizeof(FLT)*N); // targ freqs (2-cmpt) - FLT S2 = (FLT)N2/2; -#pragma omp for schedule(dynamic,TEST_RANDCHUNK) - for (BIGINT k=0; k 1) { + t = (FLT *)malloc(sizeof(FLT) * N); // targ freqs (2-cmpt) + FLT S2 = (FLT)N2 / 2; +#pragma omp for schedule(dynamic, TEST_RANDCHUNK) + for (BIGINT k = 0; k < N; ++k) { + t[k] = S2 * (-0.5 + randm11r(&se)); } - } - if(ndim > 2) { - u = (FLT*)malloc(sizeof(FLT)*N); // targ freqs (3-cmpt) - FLT S3 = (FLT)N3/2; -#pragma omp for schedule(dynamic,TEST_RANDCHUNK) - for (BIGINT k=0; k 2) { + u = (FLT *)malloc(sizeof(FLT) * N); // targ freqs (3-cmpt) + FLT S3 = (FLT)N3 / 2; +#pragma omp for schedule(dynamic, TEST_RANDCHUNK) + for (BIGINT k = 0; k < N; ++k) { + u[k] = S3 * (0.9 + randm11r(&se)); } } } } - - CPX* c = (CPX*)malloc(sizeof(CPX)*M*ntransf); // strengths - CPX* F = (CPX*)malloc(sizeof(CPX)*N*ntransf); // mode ampls - - FLT *x = (FLT *)malloc(sizeof(FLT)*M), *y=NULL, *z=NULL; // NU pts x coords - if(ndim > 1) - y = (FLT *)malloc(sizeof(FLT)*M); // NU pts y coords - if(ndim > 2) - z = (FLT *)malloc(sizeof(FLT)*M); // NU pts z coords + + CPX *c = (CPX *)malloc(sizeof(CPX) * M * ntransf); // strengths + CPX *F = (CPX *)malloc(sizeof(CPX) * N * ntransf); // mode ampls + + FLT *x = (FLT *)malloc(sizeof(FLT) * M), *y = NULL, *z = NULL; // NU pts x coords + if (ndim > 1) y = (FLT *)malloc(sizeof(FLT) * M); // NU pts y coords + if (ndim > 2) z = (FLT *)malloc(sizeof(FLT) * M); // NU pts z coords #pragma omp parallel { - unsigned int se=MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s -#pragma omp for schedule(dynamic,TEST_RANDCHUNK) - for (BIGINT j=0; j1) { - printf("error (ier=%d)!\n",ier); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else { - if (type!=3) + if (type != 3) printf("\tplan, for %lld modes: \t\t%.3g s\n", (long long)N, plan_t); else printf("\tplan:\t\t\t\t\t%.3g s\n", plan_t); } - - timer.restart(); // Guru Step 2 - ier = FINUFFT_SETPTS(plan, M, x, y, z, N, s, t, u); //(t1,2: N,s,t,u ignored) + + timer.restart(); // Guru Step 2 + ier = FINUFFT_SETPTS(plan, M, x, y, z, N, s, t, u); //(t1,2: N,s,t,u ignored) double sort_t = timer.elapsedsec(); if (ier) { - printf("error (ier=%d)!\n",ier); + printf("error (ier=%d)!\n", ier); return ier; } else { - if (type!=3) + if (type != 3) printf("\tsetpts for %lld NU pts: \t\t%.3g s\n", (long long)M, sort_t); else - printf("\tsetpts for %lld + %lld NU pts: \t%.3g s\n", (long long)M, (long long)N, sort_t); + printf("\tsetpts for %lld + %lld NU pts: \t%.3g s\n", (long long)M, (long long)N, + sort_t); } - - timer.restart(); // Guru Step 3 - ier = FINUFFT_EXECUTE(plan,c,F); - double exec_t=timer.elapsedsec(); + + timer.restart(); // Guru Step 3 + ier = FINUFFT_EXECUTE(plan, c, F); + double exec_t = timer.elapsedsec(); if (ier) { - printf("error (ier=%d)!\n",ier); + printf("error (ier=%d)!\n", ier); return ier; } else printf("\texec \t\t\t\t\t%.3g s\n", exec_t); double totalTime = plan_t + sort_t + exec_t; - if (type!=3) - printf("ntr=%d: %lld NU pts to %lld modes in %.3g s \t%.3g NU pts/s\n", ntransf, (long long)M,(long long)N, totalTime, ntransf*M/totalTime); + if (type != 3) + printf("ntr=%d: %lld NU pts to %lld modes in %.3g s \t%.3g NU pts/s\n", ntransf, + (long long)M, (long long)N, totalTime, ntransf * M 
/ totalTime); else - printf("ntr=%d: %lld NU pts to %lld NU pts in %.3g s \t%.3g tot NU pts/s\n", ntransf, (long long)M,(long long)N, totalTime, ntransf*(N+M)/totalTime); + printf("ntr=%d: %lld NU pts to %lld NU pts in %.3g s \t%.3g tot NU pts/s\n", ntransf, + (long long)M, (long long)N, totalTime, ntransf * (N + M) / totalTime); // Comparing timing results with repeated calls to corresponding finufft function... @@ -199,40 +207,38 @@ int main(int argc, char* argv[]) // by Andrea Malleo, but in this case we need to access the plan later // for many_simple_calls() to work, so we cannot do FFTW cleanup without // apparently causing segfault :(. So we skip them. - //FFTW_CLEANUP(); - //FFTW_CLEANUP_THREADS(); - //FFTW_FORGET_WISDOM(); - - //std::this_thread::sleep_for(std::chrono::seconds(1)); if c++11 is allowed - sleep(tsleep); //sleep for one second using linux sleep call - - - printf("Compare speed of repeated calls to simple interface:------------------------\n"); + // FFTW_CLEANUP(); + // FFTW_CLEANUP_THREADS(); + // FFTW_FORGET_WISDOM(); + + // std::this_thread::sleep_for(std::chrono::seconds(1)); if c++11 is allowed + sleep(tsleep); // sleep for one second using linux sleep call + + printf( + "Compare speed of repeated calls to simple interface:------------------------\n"); // this used to actually call Alex's old (v1.1) src/finufft?d.cpp routines. // Since we don't want to ship those, we now call the simple interfaces. - - double simpleTime = many_simple_calls(c,F, x, y, z, plan); - if (isnan(simpleTime)) - return 1; - - if (type!=3) - printf("%d of:\t%lld NU pts to %lld modes in %.3g s \t%.3g NU pts/s\n", - ntransf,(long long)M,(long long)N, simpleTime, ntransf*M/simpleTime); + + double simpleTime = many_simple_calls(c, F, x, y, z, plan); + if (isnan(simpleTime)) return 1; + + if (type != 3) + printf("%d of:\t%lld NU pts to %lld modes in %.3g s \t%.3g NU pts/s\n", ntransf, + (long long)M, (long long)N, simpleTime, ntransf * M / simpleTime); else - printf("%d of:\t%lld NU pts to %lld NU pts in %.3g s \t%.3g tot NU pts/s\n", - ntransf,(long long)M,(long long)N, simpleTime, ntransf*(M+N)/simpleTime); - printf("\tspeedup \t T_finufft%dd%d_simple / T_finufft%dd%d = %.3g\n",ndim,type, - ndim, type, simpleTime/totalTime); + printf("%d of:\t%lld NU pts to %lld NU pts in %.3g s \t%.3g tot NU pts/s\n", ntransf, + (long long)M, (long long)N, simpleTime, ntransf * (M + N) / simpleTime); + printf("\tspeedup \t T_finufft%dd%d_simple / T_finufft%dd%d = %.3g\n", ndim, type, ndim, + type, simpleTime / totalTime); - - FINUFFT_DESTROY(plan); // Guru Step 4 + FINUFFT_DESTROY(plan); // Guru Step 4 // (must be done *after* many_simple_calls, which sneaks a look at the plan!) // however, segfaults, maybe because plan->opts.debug changed? - + //---------------------------- Free Memory (no need to test if NULL) free(F); free(c); - free(x); + free(x); free(y); free(z); free(s); @@ -241,7 +247,6 @@ int main(int argc, char* argv[]) return 0; } - // -------------------------------- HELPER FUNCS ---------------------------- double finufftFunnel(CPX *cStart, CPX *fStart, FLT *x, FLT *y, FLT *z, FINUFFT_PLAN plan) @@ -253,156 +258,161 @@ double finufftFunnel(CPX *cStart, CPX *fStart, FLT *x, FLT *y, FLT *z, FINUFFT_P Malleo 2019; xyz passed in by Barnett 5/26/20 to prevent X_orig fields. 
*/ { - finufft::utils::CNTime timer; timer.start(); - int ier = 0; - double t = 0; - double fail = NAN; // dummy code for failure - finufft_opts* popts = &(plan->opts); // opts ptr, as v1.2 simple calls need - switch (plan->dim){ - - case 1: // 1D - switch (plan->type){ + finufft::utils::CNTime timer; + timer.start(); + int ier = 0; + double t = 0; + double fail = NAN; // dummy code for failure + finufft_opts *popts = &(plan->opts); // opts ptr, as v1.2 simple calls need + switch (plan->dim) { + + case 1: // 1D + switch (plan->type) { case 1: timer.restart(); - ier = FINUFFT1D1(plan->nj, x, cStart, plan->fftSign, plan->tol, plan->ms, fStart, popts); - t = timer.elapsedsec(); - if(ier) - return fail; + ier = FINUFFT1D1(plan->nj, x, cStart, plan->fftSign, plan->tol, plan->ms, fStart, + popts); + t = timer.elapsedsec(); + if (ier) + return fail; else - return t; - + return t; + case 2: timer.restart(); - ier = FINUFFT1D2(plan->nj, x, cStart, plan->fftSign, plan->tol, plan->ms, fStart, popts); - t = timer.elapsedsec(); - if(ier) - return fail; + ier = FINUFFT1D2(plan->nj, x, cStart, plan->fftSign, plan->tol, plan->ms, fStart, + popts); + t = timer.elapsedsec(); + if (ier) + return fail; else - return t; - + return t; + case 3: timer.restart(); - ier = FINUFFT1D3(plan->nj, x, cStart, plan->fftSign, plan->tol, plan->nk, plan->S, fStart, popts); - t = timer.elapsedsec(); - if(ier) - return fail; + ier = FINUFFT1D3(plan->nj, x, cStart, plan->fftSign, plan->tol, plan->nk, plan->S, + fStart, popts); + t = timer.elapsedsec(); + if (ier) + return fail; else - return t; - + return t; + default: - return fail; + return fail; } - case 2: // 2D - switch(plan->type){ - + case 2: // 2D + switch (plan->type) { + case 1: timer.restart(); - ier = FINUFFT2D1(plan->nj, x,y, cStart, plan->fftSign, plan->tol, plan->ms, plan->mt, fStart, popts); - t = timer.elapsedsec(); - if(ier) - return fail; + ier = FINUFFT2D1(plan->nj, x, y, cStart, plan->fftSign, plan->tol, plan->ms, + plan->mt, fStart, popts); + t = timer.elapsedsec(); + if (ier) + return fail; else - return t; - + return t; + case 2: timer.restart(); - ier = FINUFFT2D2(plan->nj, x,y, cStart, plan->fftSign, plan->tol, plan->ms, plan->mt, - fStart, popts); - t = timer.elapsedsec(); - if(ier) - return fail; + ier = FINUFFT2D2(plan->nj, x, y, cStart, plan->fftSign, plan->tol, plan->ms, + plan->mt, fStart, popts); + t = timer.elapsedsec(); + if (ier) + return fail; else - return t; + return t; case 3: timer.restart(); - ier = FINUFFT2D3(plan->nj, x,y, cStart, plan->fftSign, plan->tol, plan->nk, plan->S, plan->T, - fStart, popts); - t = timer.elapsedsec(); - if(ier) - return fail; + ier = FINUFFT2D3(plan->nj, x, y, cStart, plan->fftSign, plan->tol, plan->nk, + plan->S, plan->T, fStart, popts); + t = timer.elapsedsec(); + if (ier) + return fail; else - return t; - + return t; + default: return fail; } - case 3: // 3D - switch(plan->type){ + case 3: // 3D + switch (plan->type) { case 1: timer.restart(); - ier = FINUFFT3D1(plan->nj, x,y,z, cStart, plan->fftSign, plan->tol, - plan->ms, plan->mt, plan->mu, fStart, popts); - t = timer.elapsedsec(); - if(ier) - return fail; + ier = FINUFFT3D1(plan->nj, x, y, z, cStart, plan->fftSign, plan->tol, plan->ms, + plan->mt, plan->mu, fStart, popts); + t = timer.elapsedsec(); + if (ier) + return fail; else - return t; - + return t; + case 2: timer.restart(); - ier = FINUFFT3D2(plan->nj, x,y,z, cStart, plan->fftSign, plan->tol, - plan->ms, plan->mt, plan->mu, fStart, popts); - t = timer.elapsedsec(); - if(ier) - return fail; + 
ier = FINUFFT3D2(plan->nj, x, y, z, cStart, plan->fftSign, plan->tol, plan->ms, + plan->mt, plan->mu, fStart, popts); + t = timer.elapsedsec(); + if (ier) + return fail; else - return t; - + return t; + case 3: timer.restart(); - ier = FINUFFT3D3(plan->nj, x,y,z, cStart, plan->fftSign, plan->tol, - plan->nk, plan->S, plan->T, plan->U, fStart, popts); - t = timer.elapsedsec(); - if(ier) - return fail; + ier = FINUFFT3D3(plan->nj, x, y, z, cStart, plan->fftSign, plan->tol, plan->nk, + plan->S, plan->T, plan->U, fStart, popts); + t = timer.elapsedsec(); + if (ier) + return fail; else - return t; + return t; - default: // invalid type + default: // invalid type return fail; } - default: // invalid dimension + default: // invalid dimension return fail; } } -double many_simple_calls(CPX *c,CPX *F, FLT* x, FLT* y, FLT* z, FINUFFT_PLAN plan) +double many_simple_calls(CPX *c, CPX *F, FLT *x, FLT *y, FLT *z, FINUFFT_PLAN plan) /* A unified interface to all of the simple interfaces, with a loop over many such transforms. Returns total time reported by the transforms. (Used to call pre-v1.2 single implementations in finufft, via runOldFinufft. The repo no longer contains those implementations, which used to be in a subdirectory.) */ -{ - CPX *cStart; - CPX *fStart; - - double time = 0; - double temp = 0;; - - for(int k = 0; k < plan->ntrans; k++){ - cStart = c + plan->nj*k; - fStart = F + plan->ms*plan->mt*plan->mu*k; - - //printf("k=%d, debug=%d.................\n",k, plan->opts.debug); - if(k != 0) { // prevent massive debug output - plan->opts.debug = 0; - plan->opts.spread_debug = 0; - } - - temp = finufftFunnel(cStart,fStart, x, y,z,plan); - if (isnan(temp)) { - fprintf(stderr,"[%s] Funnel call to finufft failed!\n",__func__); - return NAN; - } - else - time += temp; +{ + CPX *cStart; + CPX *fStart; + + double time = 0; + double temp = 0; + ; + + for (int k = 0; k < plan->ntrans; k++) { + cStart = c + plan->nj * k; + fStart = F + plan->ms * plan->mt * plan->mu * k; + + // printf("k=%d, debug=%d.................\n",k, plan->opts.debug); + if (k != 0) { // prevent massive debug output + plan->opts.debug = 0; + plan->opts.spread_debug = 0; } - return time; + + temp = finufftFunnel(cStart, fStart, x, y, z, plan); + if (isnan(temp)) { + fprintf(stderr, "[%s] Funnel call to finufft failed!\n", __func__); + return NAN; + } else + time += temp; + } + return time; } diff --git a/perftest/manysmallprobs.cpp b/perftest/manysmallprobs.cpp index c6776cf0e..0f2c9d0bb 100644 --- a/perftest/manysmallprobs.cpp +++ b/perftest/manysmallprobs.cpp @@ -10,14 +10,14 @@ using namespace finufft::utils; #include using namespace std; -int main(int argc, char* argv[]) +int main(int argc, char *argv[]) /* What is small-problem cost of FINUFFT library from C++, using plain arrays of C++ complex numbers? Barnett 10/31/17. for Xi Chen question. Updated to also demo guru interface and compare speed. 6/7/22 made deterministic changes so check answer matches both ways. - g++ -fopenmp manysmallprobs.cpp ../lib-static/libfinufft.a -o manysmallprobs -lfftw3 -lfftw3_omp -lm - # multithreaded is much slower, due to overhead of starting threads?... + g++ -fopenmp manysmallprobs.cpp ../lib-static/libfinufft.a -o manysmallprobs -lfftw3 + -lfftw3_omp -lm # multithreaded is much slower, due to overhead of starting threads?... export OMP_NUM_THREADS=1 time ./manysmallprobs @@ -26,54 +26,64 @@ int main(int argc, char* argv[]) But why is multi-thread so much slower? (thread start-up time?) 
*/ -{ - int M = 2e2; // number of nonuniform points - int N = 2e2; // number of modes - int reps = 2e4; // how many repetitions - double acc = 1e-6; // desired accuracy - - complex I = complex(0.0,1.0); // the imaginary unit +{ + int M = 2e2; // number of nonuniform points + int N = 2e2; // number of modes + int reps = 2e4; // how many repetitions + double acc = 1e-6; // desired accuracy + + complex I = complex(0.0, 1.0); // the imaginary unit int ier; - + // generate some random nonuniform points (x) and complex strengths (c): - double *x = (double *)malloc(sizeof(double)*M); - complex* c = (complex*)malloc(sizeof(complex)*M); - for (int j=0; j *c = (complex *)malloc(sizeof(complex) * M); + for (int j = 0; j < M; ++j) { + x[j] = M_PI * (2 * ((double)rand() / RAND_MAX) - 1); // uniform random in [-pi,pi] + c[j] = + 2 * ((double)rand() / RAND_MAX) - 1 + I * (2 * ((double)rand() / RAND_MAX) - 1); } // allocate output array for the Fourier modes: - complex* F = (complex*)malloc(sizeof(complex)*N); + complex *F = (complex *)malloc(sizeof(complex) * N); printf("repeatedly calling the simple interface: --------------------- \n"); - finufft::utils::CNTime timer; timer.start(); - for (int r=0;r y=F[0]; // actually use the data so not optimized away - printf("%d reps of 1d1 done in %.3g s,\t%.3g NU pts/s\t(last ier=%d)\nF[0]=%.6g + %.6gi\n",reps,timer.elapsedsec(),reps*M/timer.elapsedsec(),ier,real(y),imag(y)); + complex y = F[0]; // actually use the data so not optimized away + printf( + "%d reps of 1d1 done in %.3g s,\t%.3g NU pts/s\t(last ier=%d)\nF[0]=%.6g + %.6gi\n", + reps, timer.elapsedsec(), reps * M / timer.elapsedsec(), ier, real(y), imag(y)); printf("repeatedly executing via the guru interface: -------------------\n"); timer.restart(); - finufft_plan plan; finufft_opts opts; finufft_default_opts(&opts); - opts.debug = 0; - int64_t Ns[]={N,1,1}; - int ntransf = 1; // since we do one at a time (neq reps) - finufft_makeplan(1,1,Ns,+1,ntransf,acc,&plan,&opts); - for (int r=0;r #include +#include #include #include -#include #include #include #include +#include using namespace finufft::spreadinterp; -using namespace finufft::utils; // for timer +using namespace finufft::utils; // for timer -void usage() -{ - printf("usage: spreadtestnd dims [M N [tol [sort [flags [debug [kerpad [kerevalmeth [upsampfac]]]]]]]]\n\twhere dims=1,2 or 3\n\tM=# nonuniform pts\n\tN=# uniform pts\n\ttol=requested accuracy\n\tsort=0 (don't sort NU pts), 1 (do), or 2 (maybe sort; default)\n\tflags: expert timing flags, 0 is default (see spreadinterp.h)\n\tdebug=0 (less text out), 1 (more), 2 (lots)\n\tkerpad=0 (no pad to mult of 4), 1 (do, for kerevalmeth=0 only)\n\tkerevalmeth=0 (direct), 1 (Horner ppval)\n\tupsampfac>1; 2 or 1.25 for Horner\n\nexample: ./spreadtestnd 1 1e6 1e6 1e-6 2 0 1\n"); +void usage() { + printf("usage: spreadtestnd dims [M N [tol [sort [flags [debug [kerpad [kerevalmeth " + "[upsampfac]]]]]]]]\n\twhere dims=1,2 or 3\n\tM=# nonuniform pts\n\tN=# uniform " + "pts\n\ttol=requested accuracy\n\tsort=0 (don't sort NU pts), 1 (do), or 2 " + "(maybe sort; default)\n\tflags: expert timing flags, 0 is default (see " + "spreadinterp.h)\n\tdebug=0 (less text out), 1 (more), 2 (lots)\n\tkerpad=0 (no " + "pad to mult of 4), 1 (do, for kerevalmeth=0 only)\n\tkerevalmeth=0 (direct), 1 " + "(Horner ppval)\n\tupsampfac>1; 2 or 1.25 for Horner\n\nexample: ./spreadtestnd " + "1 1e6 1e6 1e-6 2 0 1\n"); } -int main(int argc, char* argv[]) +int main(int argc, char *argv[]) /* Test executable for the 1D, 2D, or 3D C++ 
spreader, both directions. * It checks speed, and basic correctness via the grid sum of the result. * See usage() for usage. Note it currently tests only pirange=0, which is not @@ -25,7 +31,8 @@ int main(int argc, char* argv[]) * Example: spreadtestnd 3 8e6 8e6 1e-6 2 0 1 * * Compilation (also check ../makefile): - * g++ spreadtestnd.cpp ../src/spreadinterp.o ../src/utils.o -o spreadtestnd -fPIC -Ofast -funroll-loops -fopenmp + * g++ spreadtestnd.cpp ../src/spreadinterp.o ../src/utils.o -o spreadtestnd -fPIC + * -Ofast -funroll-loops -fopenmp * * Magland; expanded by Barnett 1/14/17. Better cmd line args 3/13/17 * indep setting N 3/27/17. parallel rand() & sort flag 3/28/17 @@ -34,192 +41,258 @@ int main(int argc, char* argv[]) * Barbone, removed pirange 05/09/24. */ { - int d = 3; // Cmd line args & their defaults: default #dims - double w, tol = 1e-6; // default (eg 1e-6 has nspread=7) - BIGINT M = 1e6; // default # NU pts - BIGINT roughNg = 1e6; // default # U pts - int sort = 2; // spread_sort - int flags = 0; // default - int debug = 0; // default - int kerpad = 0; // default - int kerevalmeth = 1; // default: Horner - FLT upsampfac = 2.0; // standard - - if (argc<2 || argc==3 || argc>11) { - usage(); return (argc>1); - } - sscanf(argv[1],"%d",&d); - if (d<1 || d>3) { - printf("d must be 1, 2 or 3!\n"); usage(); return 1; - } - if (argc>2) { - sscanf(argv[2],"%lf",&w); M = (BIGINT)w; // to read "1e6" right! - if (M<1) { - printf("M (# NU pts) must be positive!\n"); usage(); return 1; + int d = 3; // Cmd line args & their defaults: default #dims + double w, tol = 1e-6; // default (eg 1e-6 has nspread=7) + BIGINT M = 1e6; // default # NU pts + BIGINT roughNg = 1e6; // default # U pts + int sort = 2; // spread_sort + int flags = 0; // default + int debug = 0; // default + int kerpad = 0; // default + int kerevalmeth = 1; // default: Horner + FLT upsampfac = 2.0; // standard + + if (argc < 2 || argc == 3 || argc > 11) { + usage(); + return (argc > 1); + } + sscanf(argv[1], "%d", &d); + if (d < 1 || d > 3) { + printf("d must be 1, 2 or 3!\n"); + usage(); + return 1; + } + if (argc > 2) { + sscanf(argv[2], "%lf", &w); + M = (BIGINT)w; // to read "1e6" right! 
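// [Illustrative aside, not part of this patch] The test programs read integer
// sizes with %lf and then cast, so that e.g. "1e6" on the command line parses
// as one million; an integer conversion such as %lld would stop at the 'e'.
// A minimal standalone sketch of the same idiom (parse_count is a hypothetical
// helper name, not from the library):
#include <cstdio>
static long long parse_count(const char *arg) {
  double w = 0.0;
  if (std::sscanf(arg, "%lf", &w) != 1) return -1; // parse failure
  return (long long)w;                             // "1e6" -> 1000000
}
// e.g. parse_count("1e6") == 1000000, parse_count("8e6") == 8000000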
+ if (M < 1) { + printf("M (# NU pts) must be positive!\n"); + usage(); + return 1; } - sscanf(argv[3],"%lf",&w); roughNg = (BIGINT)w; - if (roughNg<1) { - printf("N (# U pts) must be positive!\n"); usage(); return 1; + sscanf(argv[3], "%lf", &w); + roughNg = (BIGINT)w; + if (roughNg < 1) { + printf("N (# U pts) must be positive!\n"); + usage(); + return 1; } } - if (argc>4) sscanf(argv[4],"%lf",&tol); - if (argc>5) { - sscanf(argv[5],"%d",&sort); - if ((sort!=0) && (sort!=1) && (sort!=2)) { - printf("sort must be 0, 1 or 2!\n"); usage(); return 1; + if (argc > 4) sscanf(argv[4], "%lf", &tol); + if (argc > 5) { + sscanf(argv[5], "%d", &sort); + if ((sort != 0) && (sort != 1) && (sort != 2)) { + printf("sort must be 0, 1 or 2!\n"); + usage(); + return 1; } } - if (argc>6) - sscanf(argv[6],"%d",&flags); - if (argc>7) { - sscanf(argv[7],"%d",&debug); - if ((debug<0) || (debug>2)) { - printf("debug must be 0, 1 or 2!\n"); usage(); return 1; + if (argc > 6) sscanf(argv[6], "%d", &flags); + if (argc > 7) { + sscanf(argv[7], "%d", &debug); + if ((debug < 0) || (debug > 2)) { + printf("debug must be 0, 1 or 2!\n"); + usage(); + return 1; } } - if (argc>8) { - sscanf(argv[8],"%d",&kerpad); - if ((kerpad<0) || (kerpad>1)) { - printf("kerpad must be 0 or 1!\n"); usage(); return 1; + if (argc > 8) { + sscanf(argv[8], "%d", &kerpad); + if ((kerpad < 0) || (kerpad > 1)) { + printf("kerpad must be 0 or 1!\n"); + usage(); + return 1; } } - if (argc>9) { - sscanf(argv[9],"%d",&kerevalmeth); - if ((kerevalmeth<0) || (kerevalmeth>1)) { - printf("kerevalmeth must be 0 or 1!\n"); usage(); return 1; + if (argc > 9) { + sscanf(argv[9], "%d", &kerevalmeth); + if ((kerevalmeth < 0) || (kerevalmeth > 1)) { + printf("kerevalmeth must be 0 or 1!\n"); + usage(); + return 1; } } - if (argc>10) { - sscanf(argv[10],"%lf",&w); upsampfac = (FLT)w; - if (upsampfac<=1.0) { - printf("upsampfac must be >1.0!\n"); usage(); return 1; + if (argc > 10) { + sscanf(argv[10], "%lf", &w); + upsampfac = (FLT)w; + if (upsampfac <= 1.0) { + printf("upsampfac must be >1.0!\n"); + usage(); + return 1; } } - int dodir1 = true; // control if dir=1 tested at all - BIGINT N = (BIGINT)round(pow(roughNg,1.0/d)); // Fourier grid size per dim - BIGINT Ng = (BIGINT)pow(N,d); // actual total grid points - BIGINT N2 = (d>=2) ? N : 1, N3 = (d==3) ? N : 1; // the y and z grid sizes - std::vector kx(M),ky(1),kz(1),d_nonuniform(2*M); // NU, Re & Im - if (d>1) ky.resize(M); // only alloc needed coords - if (d>2) kz.resize(M); - std::vector d_uniform(2*Ng); // Re and Im + int dodir1 = true; // control if dir=1 tested at all + BIGINT N = (BIGINT)round(pow(roughNg, 1.0 / d)); // Fourier grid size per dim + BIGINT Ng = (BIGINT)pow(N, d); // actual total grid points + BIGINT N2 = (d >= 2) ? N : 1, N3 = (d == 3) ? N : 1; // the y and z grid sizes + std::vector kx(M), ky(1), kz(1), d_nonuniform(2 * M); // NU, Re & Im + if (d > 1) ky.resize(M); // only alloc needed coords + if (d > 2) kz.resize(M); + std::vector d_uniform(2 * Ng); // Re and Im finufft_spread_opts opts; - int ier_set = setup_spreader(opts,(FLT)tol,upsampfac,kerevalmeth,debug,1,d); - if (ier_set>1) { // exit gracefully if can't set up. - printf("error when setting up spreader (ier_set=%d)!\n",ier_set); + int ier_set = setup_spreader(opts, (FLT)tol, upsampfac, kerevalmeth, debug, 1, d); + if (ier_set > 1) { // exit gracefully if can't set up. + printf("error when setting up spreader (ier_set=%d)!\n", ier_set); return ier_set; } - opts.debug = debug; // print more diagnostics? 
- opts.sort = sort; - opts.flags = flags; - opts.kerpad = kerpad; - opts.upsampfac = upsampfac; - opts.nthreads = 0; // max # threads used, or 0 to use what's avail + opts.debug = debug; // print more diagnostics? + opts.sort = sort; + opts.flags = flags; + opts.kerpad = kerpad; + opts.upsampfac = upsampfac; + opts.nthreads = 0; // max # threads used, or 0 to use what's avail opts.sort_threads = 0; - //opts.max_subproblem_size = 1e5; + // opts.max_subproblem_size = 1e5; FLT maxerr, ansmod; - + // spread a single source, only for reference accuracy check... - opts.spread_direction=1; - d_nonuniform[0] = 1.0; d_nonuniform[1] = 0.0; // unit strength - kx[0] = ky[0] = kz[0] = 0.0; // at center (probably doesn't matter); domain is [-pi,pi)^d - int ier = spreadinterp(N,N2,N3,d_uniform.data(),1,kx.data(),ky.data(),kz.data(),d_nonuniform.data(),opts); // vector::data officially C++11 but works - if (ier!=0) { - printf("error when spreading M=1 pt for ref acc check (ier=%d)!\n",ier); + opts.spread_direction = 1; + d_nonuniform[0] = 1.0; + d_nonuniform[1] = 0.0; // unit strength + kx[0] = ky[0] = kz[0] = 0.0; // at center (probably doesn't matter); domain is + // [-pi,pi)^d + int ier = spreadinterp(N, + N2, + N3, + d_uniform.data(), + 1, + kx.data(), + ky.data(), + kz.data(), + d_nonuniform.data(), + opts); // vector::data officially C++11 but works + if (ier != 0) { + printf("error when spreading M=1 pt for ref acc check (ier=%d)!\n", ier); return ier; } - FLT kersumre = 0.0, kersumim = 0.0; // sum kernel on uniform grid - for (BIGINT i=0;i1) ky[i]=randm11r(&se)*3*M_PI; // only fill needed coords - if (d>2) kz[i]=randm11r(&se)*3*M_PI; - d_nonuniform[i*2]=randm11r(&se); - d_nonuniform[i*2+1]=randm11r(&se); - strre += d_nonuniform[2*i]; - strim += d_nonuniform[2*i+1]; + unsigned int se = MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s +#pragma omp for schedule(dynamic, 1000000) reduction(+ : strre, strim) + for (BIGINT i = 0; i < M; ++i) { + kx[i] = randm11r(&se) * 3 * M_PI; + // kx[i]=2.0*kx[i] - 50.0; //// to test folding within +-1 period + if (d > 1) ky[i] = randm11r(&se) * 3 * M_PI; // only fill needed coords + if (d > 2) kz[i] = randm11r(&se) * 3 * M_PI; + d_nonuniform[i * 2] = randm11r(&se); + d_nonuniform[i * 2 + 1] = randm11r(&se); + strre += d_nonuniform[2 * i]; + strim += d_nonuniform[2 * i + 1]; } } CNTime timer; double t; - if (dodir1) { // test direction 1 (NU -> U spreading) ...................... - printf("spreadinterp %dD, %.3g U pts, dir=%d, tol=%.3g: nspread=%d\n",d,(double)Ng,opts.spread_direction,tol,opts.nspread); + if (dodir1) { // test direction 1 (NU -> U spreading) ...................... + printf("spreadinterp %dD, %.3g U pts, dir=%d, tol=%.3g: nspread=%d\n", + d, + (double)Ng, + opts.spread_direction, + tol, + opts.nspread); timer.start(); - ier = spreadinterp(N,N2,N3,d_uniform.data(),M,kx.data(),ky.data(),kz.data(),d_nonuniform.data(),opts); - t=timer.elapsedsec(); - if (ier!=0) { - printf("error (ier=%d)!\n",ier); + ier = spreadinterp(N, + N2, + N3, + d_uniform.data(), + M, + kx.data(), + ky.data(), + kz.data(), + d_nonuniform.data(), + opts); + t = timer.elapsedsec(); + if (ier != 0) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf(" %.3g NU pts in %.3g s \t%.3g pts/s \t%.3g spread pts/s\n",(double)M,t,M/t,pow(opts.nspread,d)*M/t); - - FLT sumre = 0.0, sumim = 0.0; // check spreading accuracy, wrapping -#pragma omp parallel for reduction(+:sumre,sumim) - for (BIGINT i=0;i NU interpolation) .............................. 
printf("making more random NU pts...\n"); - for (BIGINT i=0;i1) ky[i]=randm11r(&se)*3*M_PI; - if (d>2) kz[i]=randm11r(&se)*3*M_PI; - } + unsigned int se = MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s +#pragma omp for schedule(dynamic, 1000000) + for (BIGINT i = 0; i < M; ++i) { // random target pts + // kx[i]=10+.9*rand01r(&s)*N; // or if want to keep ns away from edges + kx[i] = randm11r(&se) * 3 * M_PI; + if (d > 1) ky[i] = randm11r(&se) * 3 * M_PI; + if (d > 2) kz[i] = randm11r(&se) * 3 * M_PI; + } } - opts.spread_direction=2; - printf("spreadinterp %dD, %.3g U pts, dir=%d, tol=%.3g: nspread=%d\n",d,(double)Ng,opts.spread_direction,tol,opts.nspread); + opts.spread_direction = 2; + printf("spreadinterp %dD, %.3g U pts, dir=%d, tol=%.3g: nspread=%d\n", + d, + (double)Ng, + opts.spread_direction, + tol, + opts.nspread); timer.restart(); - ier = spreadinterp(N,N2,N3,d_uniform.data(),M,kx.data(),ky.data(),kz.data(),d_nonuniform.data(),opts); - t=timer.elapsedsec(); - if (ier!=0) { - printf("error (ier=%d)!\n",ier); + ier = spreadinterp(N, + N2, + N3, + d_uniform.data(), + M, + kx.data(), + ky.data(), + kz.data(), + d_nonuniform.data(), + opts); + t = timer.elapsedsec(); + if (ier != 0) { + printf("error (ier=%d)!\n", ier); return 1; } else - printf(" %.3g NU pts in %.3g s \t%.3g pts/s \t%.3g spread pts/s\n",(double)M,t,M/t,pow(opts.nspread,d)*M/t); + printf(" %.3g NU pts in %.3g s \t%.3g pts/s \t%.3g spread pts/s\n", + (double)M, + t, + M / t, + pow(opts.nspread, d) * M / t); // math test is worst-case error from pred value (kersum) on interp pts: maxerr = 0.0; - for (BIGINT i=0;imaxerr) maxerr=err; + for (BIGINT i = 0; i < M; ++i) { + FLT err = std::max(fabs(d_nonuniform[2 * i] - kersumre), + fabs(d_nonuniform[2 * i + 1] - kersumim)); + if (err > maxerr) maxerr = err; } - ansmod = sqrt(kersumre*kersumre+kersumim*kersumim); - printf(" max rel err in values at NU pts: %.3g\n",maxerr/ansmod); + ansmod = sqrt(kersumre * kersumre + kersumim * kersumim); + printf(" max rel err in values at NU pts: %.3g\n", maxerr / ansmod); // this is stronger test than for dir=1, since it tests sum of kernel for // each NU pt. 
However, it cannot detect reading // from wrong grid pts (they are all unity) diff --git a/perftest/spreadtestndall.cpp b/perftest/spreadtestndall.cpp index 55da3f978..666003137 100644 --- a/perftest/spreadtestndall.cpp +++ b/perftest/spreadtestndall.cpp @@ -11,12 +11,18 @@ using namespace finufft::spreadinterp; using namespace finufft::utils; // for timer void usage() { - printf("usage: spreadtestnd dims [M N [dir [sort [flags [debug [kerpad [kerevalmeth [upsampfac]]]]]]]]\n\twhere " - "dims=1,2 or 3\n\tM=# nonuniform pts\n\tN=# uniform pts\n\tdir=spreader direction " - "[spread/interpolate]\n\tsort=0 (don't sort NU pts), 1 (do), or 2 (maybe sort; default)\n\tflags: expert " - "timing flags, 0 is default (see spreadinterp.h)\n\tdebug=0 (less text out), 1 (more), 2 (lots)\n\tkerpad=0 " - "(no pad to mult of 4), 1 (do, for kerevalmeth=0 only)\n\tkerevalmeth=0 (direct), 1 (Horner " - "ppval)\n\tupsampfac>1; 2 or 1.25 for Horner\n\nexample: ./spreadtestndall 1 1e6 1e6 1 2 0 1\n"); + printf( + "usage: spreadtestnd dims [M N [dir [sort [flags [debug [kerpad [kerevalmeth " + "[upsampfac]]]]]]]]\n\twhere " + "dims=1,2 or 3\n\tM=# nonuniform pts\n\tN=# uniform pts\n\tdir=spreader direction " + "[spread/interpolate]\n\tsort=0 (don't sort NU pts), 1 (do), or 2 (maybe sort; " + "default)\n\tflags: expert " + "timing flags, 0 is default (see spreadinterp.h)\n\tdebug=0 (less text out), 1 " + "(more), 2 (lots)\n\tkerpad=0 " + "(no pad to mult of 4), 1 (do, for kerevalmeth=0 only)\n\tkerevalmeth=0 (direct), " + "1 (Horner " + "ppval)\n\tupsampfac>1; 2 or 1.25 for Horner\n\nexample: ./spreadtestndall 1 1e6 " + "1e6 1 2 0 1\n"); } int main(int argc, char *argv[]) @@ -28,249 +34,242 @@ int main(int argc, char *argv[]) * Example: spreadtestndall 3 1e7 1e7 1 1 * * Compilation (also check ../makefile): - * g++ spreadtestndall.cpp ../src/spreadinterp.o ../src/utils.o -o spreadtestndall -fPIC -Ofast -funroll-loops - * -fopenmp + * g++ spreadtestndall.cpp ../src/spreadinterp.o ../src/utils.o -o spreadtestndall + * -fPIC -Ofast -funroll-loops -fopenmp * */ { - int d = 3; // Cmd line args & their defaults: default #dims - double w; - int dir = 1; // default (eg 1e-6 has nspread=7) - BIGINT M = 1e6; // default # NU pts - BIGINT roughNg = 1e6; // default # U pts - int sort = 2; // spread_sort - int flags = 0; // default - int debug = 0; // default - int kerpad = 0; // default - int kerevalmeth = 1; // default: Horner - FLT upsampfac = 2.0; // standard + int d = 3; // Cmd line args & their defaults: default #dims + double w; + int dir = 1; // default (eg 1e-6 has nspread=7) + BIGINT M = 1e6; // default # NU pts + BIGINT roughNg = 1e6; // default # U pts + int sort = 2; // spread_sort + int flags = 0; // default + int debug = 0; // default + int kerpad = 0; // default + int kerevalmeth = 1; // default: Horner + FLT upsampfac = 2.0; // standard - if (argc < 2 || argc == 3 || argc > 11) { - usage(); - return (argc > 1); + if (argc < 2 || argc == 3 || argc > 11) { + usage(); + return (argc > 1); + } + sscanf(argv[1], "%d", &d); + if (d < 1 || d > 3) { + printf("d must be 1, 2 or 3!\n"); + usage(); + return 1; + } + if (argc > 2) { + sscanf(argv[2], "%lf", &w); + M = (BIGINT)w; // to read "1e6" right! + if (M < 1) { + printf("M (# NU pts) must be positive!\n"); + usage(); + return 1; } - sscanf(argv[1], "%d", &d); - if (d < 1 || d > 3) { - printf("d must be 1, 2 or 3!\n"); - usage(); - return 1; - } - if (argc > 2) { - sscanf(argv[2], "%lf", &w); - M = (BIGINT)w; // to read "1e6" right! 
- if (M < 1) { - printf("M (# NU pts) must be positive!\n"); - usage(); - return 1; - } - sscanf(argv[3], "%lf", &w); - roughNg = (BIGINT)w; - if (roughNg < 1) { - printf("N (# U pts) must be positive!\n"); - usage(); - return 1; - } + sscanf(argv[3], "%lf", &w); + roughNg = (BIGINT)w; + if (roughNg < 1) { + printf("N (# U pts) must be positive!\n"); + usage(); + return 1; } - if (argc > 4) - sscanf(argv[4], "%d", &dir); - if (argc > 5) { - sscanf(argv[5], "%d", &sort); - if ((sort != 0) && (sort != 1) && (sort != 2)) { - printf("sort must be 0, 1 or 2!\n"); - usage(); - return 1; - } + } + if (argc > 4) sscanf(argv[4], "%d", &dir); + if (argc > 5) { + sscanf(argv[5], "%d", &sort); + if ((sort != 0) && (sort != 1) && (sort != 2)) { + printf("sort must be 0, 1 or 2!\n"); + usage(); + return 1; } - if (argc > 6) - sscanf(argv[6], "%d", &flags); - if (argc > 7) { - sscanf(argv[7], "%d", &debug); - if ((debug < 0) || (debug > 2)) { - printf("debug must be 0, 1 or 2!\n"); - usage(); - return 1; - } + } + if (argc > 6) sscanf(argv[6], "%d", &flags); + if (argc > 7) { + sscanf(argv[7], "%d", &debug); + if ((debug < 0) || (debug > 2)) { + printf("debug must be 0, 1 or 2!\n"); + usage(); + return 1; } - if (argc > 8) { - sscanf(argv[8], "%d", &kerpad); - if ((kerpad < 0) || (kerpad > 1)) { - printf("kerpad must be 0 or 1!\n"); - usage(); - return 1; - } + } + if (argc > 8) { + sscanf(argv[8], "%d", &kerpad); + if ((kerpad < 0) || (kerpad > 1)) { + printf("kerpad must be 0 or 1!\n"); + usage(); + return 1; } - if (argc > 9) { - sscanf(argv[9], "%d", &kerevalmeth); - if ((kerevalmeth < 0) || (kerevalmeth > 1)) { - printf("kerevalmeth must be 0 or 1!\n"); - usage(); - return 1; - } + } + if (argc > 9) { + sscanf(argv[9], "%d", &kerevalmeth); + if ((kerevalmeth < 0) || (kerevalmeth > 1)) { + printf("kerevalmeth must be 0 or 1!\n"); + usage(); + return 1; } - if (argc > 10) { - sscanf(argv[10], "%lf", &w); - upsampfac = (FLT)w; - if (upsampfac <= 1.0) { - printf("upsampfac must be >1.0!\n"); - usage(); - return 1; - } + } + if (argc > 10) { + sscanf(argv[10], "%lf", &w); + upsampfac = (FLT)w; + if (upsampfac <= 1.0) { + printf("upsampfac must be >1.0!\n"); + usage(); + return 1; } + } - BIGINT N = (BIGINT)round(pow(roughNg, 1.0 / d)); // Fourier grid size per dim - BIGINT Ng = (BIGINT)pow(N, d); // actual total grid points - BIGINT N2 = (d >= 2) ? N : 1, N3 = (d == 3) ? N : 1; // the y and z grid sizes - std::vector kx(M), ky(1), kz(1), d_nonuniform(2 * M); // NU, Re & Im - if (d > 1) - ky.resize(M); // only alloc needed coords - if (d > 2) - kz.resize(M); - std::vector d_uniform(2 * Ng); // Re and Im + BIGINT N = (BIGINT)round(pow(roughNg, 1.0 / d)); // Fourier grid size per dim + BIGINT Ng = (BIGINT)pow(N, d); // actual total grid points + BIGINT N2 = (d >= 2) ? N : 1, N3 = (d == 3) ? 
N : 1; // the y and z grid sizes + std::vector kx(M), ky(1), kz(1), d_nonuniform(2 * M); // NU, Re & Im + if (d > 1) ky.resize(M); // only alloc needed coords + if (d > 2) kz.resize(M); + std::vector d_uniform(2 * Ng); // Re and Im - finufft_spread_opts opts; - const auto max_digits = []() { - if (std::is_same::value) { - return 17; - } else { - return 9; - } - }(); - for (int digits = 2; digits < max_digits; digits++) { - const auto tol = 10.0 * pow(10.0, -digits); - printf("digits=%d, tol = %.3g\n", digits, FLT(tol)); - int ier_set = setup_spreader(opts, tol, upsampfac, kerevalmeth, debug, 1, d); + finufft_spread_opts opts; + const auto max_digits = []() { + if (std::is_same::value) { + return 17; + } else { + return 9; + } + }(); + for (int digits = 2; digits < max_digits; digits++) { + const auto tol = 10.0 * pow(10.0, -digits); + printf("digits=%d, tol = %.3g\n", digits, FLT(tol)); + int ier_set = setup_spreader(opts, tol, upsampfac, kerevalmeth, debug, 1, d); - if (ier_set > 1) { // exit gracefully if can't set up. - printf("error when setting up spreader (ier_set=%d)!\n", ier_set); - return ier_set; - } - opts.debug = debug; // print more diagnostics? - opts.sort = sort; - opts.flags = flags; - opts.kerpad = kerpad; - opts.upsampfac = upsampfac; - opts.nthreads = 0; // max # threads used, or 0 to use what's avail - opts.sort_threads = 0; - opts.kerpad = 0; - // opts.max_subproblem_size = 1e5; - FLT maxerr, ansmod; + if (ier_set > 1) { // exit gracefully if can't set up. + printf("error when setting up spreader (ier_set=%d)!\n", ier_set); + return ier_set; + } + opts.debug = debug; // print more diagnostics? + opts.sort = sort; + opts.flags = flags; + opts.kerpad = kerpad; + opts.upsampfac = upsampfac; + opts.nthreads = 0; // max # threads used, or 0 to use what's avail + opts.sort_threads = 0; + opts.kerpad = 0; + // opts.max_subproblem_size = 1e5; + FLT maxerr, ansmod; - // spread a single source, only for reference accuracy check... - opts.spread_direction = 1; + // spread a single source, only for reference accuracy check... + opts.spread_direction = 1; - d_nonuniform[0] = 1.0; - d_nonuniform[1] = 0.0; // unit strength - kx[0] = ky[0] = kz[0] = M_PI / 2.0; // at center - int ier = spreadinterp(N, N2, N3, d_uniform.data(), 1, kx.data(), ky.data(), kz.data(), d_nonuniform.data(), - opts); // vector::data officially C++11 but works - if (ier != 0) { - printf("error when spreading M=1 pt for ref acc check (ier=%d)!\n", ier); - return ier; - } - FLT kersumre = 0.0, kersumim = 0.0; // sum kernel on uniform grid - for (BIGINT i = 0; i < Ng; ++i) { - kersumre += d_uniform[2 * i]; - kersumim += d_uniform[2 * i + 1]; // in case the kernel isn't real! - } + d_nonuniform[0] = 1.0; + d_nonuniform[1] = 0.0; // unit strength + kx[0] = ky[0] = kz[0] = M_PI / 2.0; // at center + int ier = spreadinterp(N, N2, N3, d_uniform.data(), 1, kx.data(), ky.data(), + kz.data(), d_nonuniform.data(), + opts); // vector::data officially C++11 but works + if (ier != 0) { + printf("error when spreading M=1 pt for ref acc check (ier=%d)!\n", ier); + return ier; + } + FLT kersumre = 0.0, kersumim = 0.0; // sum kernel on uniform grid + for (BIGINT i = 0; i < Ng; ++i) { + kersumre += d_uniform[2 * i]; + kersumim += d_uniform[2 * i + 1]; // in case the kernel isn't real! + } - // now do the large-scale test w/ random sources.. - printf("making random data...\n"); - FLT strre = 0.0, strim = 0.0; // also sum the strengths + // now do the large-scale test w/ random sources.. 
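// [Illustrative aside, not part of this patch] The dir=1 consistency check
// below relies on linearity of spreading: if the complex sum of all strengths
// is S, the total over the output grid should be approximately S*K, where K is
// the grid total produced by the single unit-strength source spread above (the
// kernel sum). A minimal sketch of that prediction with std::complex; the
// helper name is hypothetical, and double precision is assumed:
#include <complex>
static std::complex<double> predicted_grid_total(std::complex<double> S,
                                                 std::complex<double> K) {
  // i.e. (Kr*Sr - Ki*Si) + i*(Ki*Sr + Kr*Si), matching the pre/pim check below
  return S * K;
}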
+ printf("making random data...\n"); + FLT strre = 0.0, strim = 0.0; // also sum the strengths #pragma omp parallel - { - unsigned int se = MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s + { + unsigned int se = MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s #pragma omp for schedule(dynamic, 1000000) reduction(+ : strre, strim) - for (BIGINT i = 0; i < M; ++i) { - kx[i] = randm11r(&se) * 3 * M_PI; - // kx[i]=2.0*kx[i] - 50.0; //// to test folding within +-1 period - if (d > 1) - ky[i] = randm11r(&se) * 3 * M_PI; // only fill needed coords - if (d > 2) - kz[i] = randm11r(&se) * 3 * M_PI; - d_nonuniform[i * 2] = randm11r(&se); - d_nonuniform[i * 2 + 1] = randm11r(&se); - strre += d_nonuniform[2 * i]; - strim += d_nonuniform[2 * i + 1]; - } - } - CNTime timer{}; - double t; - if (dir == 1) { // test direction 1 (NU -> U spreading) ...................... - printf("spreadinterp %dD, %.3g U pts, dir=%d, tol=%.3g: nspread=%d\n", d, (double)Ng, opts.spread_direction, - tol, opts.nspread); - timer.start(); - ier = spreadinterp(N, N2, N3, d_uniform.data(), M, kx.data(), ky.data(), kz.data(), d_nonuniform.data(), - opts); - t = timer.elapsedsec(); - if (ier != 0) { - printf("error (ier=%d)!\n", ier); - return ier; - } else - printf(" %.3g NU pts in %.3g s \t%.3g pts/s \t%.3g spread pts/s\n", (double)M, t, M / t, - pow(opts.nspread, d) * M / t); + for (BIGINT i = 0; i < M; ++i) { + kx[i] = randm11r(&se) * 3 * M_PI; + // kx[i]=2.0*kx[i] - 50.0; //// to test folding within +-1 period + if (d > 1) ky[i] = randm11r(&se) * 3 * M_PI; // only fill needed coords + if (d > 2) kz[i] = randm11r(&se) * 3 * M_PI; + d_nonuniform[i * 2] = randm11r(&se); + d_nonuniform[i * 2 + 1] = randm11r(&se); + strre += d_nonuniform[2 * i]; + strim += d_nonuniform[2 * i + 1]; + } + } + CNTime timer{}; + double t; + if (dir == 1) { // test direction 1 (NU -> U spreading) ...................... + printf("spreadinterp %dD, %.3g U pts, dir=%d, tol=%.3g: nspread=%d\n", d, + (double)Ng, opts.spread_direction, tol, opts.nspread); + timer.start(); + ier = spreadinterp(N, N2, N3, d_uniform.data(), M, kx.data(), ky.data(), kz.data(), + d_nonuniform.data(), opts); + t = timer.elapsedsec(); + if (ier != 0) { + printf("error (ier=%d)!\n", ier); + return ier; + } else + printf(" %.3g NU pts in %.3g s \t%.3g pts/s \t%.3g spread pts/s\n", (double)M, + t, M / t, pow(opts.nspread, d) * M / t); - FLT sumre = 0.0, sumim = 0.0; // check spreading accuracy, wrapping + FLT sumre = 0.0, sumim = 0.0; // check spreading accuracy, wrapping #pragma omp parallel for reduction(+ : sumre, sumim) - for (BIGINT i = 0; i < Ng; ++i) { - sumre += d_uniform[2 * i]; - sumim += d_uniform[2 * i + 1]; - } - FLT pre = kersumre * strre - kersumim * strim; // pred ans, complex mult - FLT pim = kersumim * strre + kersumre * strim; - FLT maxerr = std::max(fabs(sumre - pre), fabs(sumim - pim)); - FLT ansmod = sqrt(sumre * sumre + sumim * sumim); - printf(" rel err in total over grid: %.3g\n", maxerr / ansmod); - // note this is weaker than below dir=2 test, but is good indicator that - // periodic wrapping is correct - } - // test direction 2 (U -> NU interpolation) .............................. 
- if (dir == 2) { - printf("making more random NU pts...\n"); - for (BIGINT i = 0; i < Ng; ++i) { // unit grid data - d_uniform[2 * i] = 1.0; - d_uniform[2 * i + 1] = 0.0; - } + for (BIGINT i = 0; i < Ng; ++i) { + sumre += d_uniform[2 * i]; + sumim += d_uniform[2 * i + 1]; + } + FLT pre = kersumre * strre - kersumim * strim; // pred ans, complex mult + FLT pim = kersumim * strre + kersumre * strim; + FLT maxerr = std::max(fabs(sumre - pre), fabs(sumim - pim)); + FLT ansmod = sqrt(sumre * sumre + sumim * sumim); + printf(" rel err in total over grid: %.3g\n", maxerr / ansmod); + // note this is weaker than below dir=2 test, but is good indicator that + // periodic wrapping is correct + } + // test direction 2 (U -> NU interpolation) .............................. + if (dir == 2) { + printf("making more random NU pts...\n"); + for (BIGINT i = 0; i < Ng; ++i) { // unit grid data + d_uniform[2 * i] = 1.0; + d_uniform[2 * i + 1] = 0.0; + } #pragma omp parallel - { - unsigned int se = MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s + { + unsigned int se = MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s #pragma omp for schedule(dynamic, 1000000) - for (BIGINT i = 0; i < M; ++i) { // random target pts - // kx[i]=10+.9*rand01r(&s)*N; // or if want to keep ns away from edges - kx[i] = randm11r(&se) * 3 * M_PI; - if (d > 1) - ky[i] = randm11r(&se) * 3 * M_PI; - if (d > 2) - kz[i] = randm11r(&se) * 3 * M_PI; - } - } + for (BIGINT i = 0; i < M; ++i) { // random target pts + // kx[i]=10+.9*rand01r(&s)*N; // or if want to keep ns away from edges + kx[i] = randm11r(&se) * 3 * M_PI; + if (d > 1) ky[i] = randm11r(&se) * 3 * M_PI; + if (d > 2) kz[i] = randm11r(&se) * 3 * M_PI; + } + } - opts.spread_direction = 2; - printf("spreadinterp %dD, %.3g U pts, dir=%d, tol=%.3g: nspread=%d\n", d, (double)Ng, opts.spread_direction, - tol, opts.nspread); - timer.restart(); - ier = spreadinterp(N, N2, N3, d_uniform.data(), M, kx.data(), ky.data(), kz.data(), d_nonuniform.data(), - opts); - t = timer.elapsedsec(); - if (ier != 0) { - printf("error (ier=%d)!\n", ier); - return 1; - } else - printf(" %.3g NU pts in %.3g s \t%.3g pts/s \t%.3g spread pts/s\n", (double)M, t, M / t, - pow(opts.nspread, d) * M / t); + opts.spread_direction = 2; + printf("spreadinterp %dD, %.3g U pts, dir=%d, tol=%.3g: nspread=%d\n", d, + (double)Ng, opts.spread_direction, tol, opts.nspread); + timer.restart(); + ier = spreadinterp(N, N2, N3, d_uniform.data(), M, kx.data(), ky.data(), kz.data(), + d_nonuniform.data(), opts); + t = timer.elapsedsec(); + if (ier != 0) { + printf("error (ier=%d)!\n", ier); + return 1; + } else + printf(" %.3g NU pts in %.3g s \t%.3g pts/s \t%.3g spread pts/s\n", (double)M, + t, M / t, pow(opts.nspread, d) * M / t); - // math test is worst-case error from pred value (kersum) on interp pts: - maxerr = 0.0; - for (BIGINT i = 0; i < M; ++i) { - FLT err = std::max(fabs(d_nonuniform[2 * i] - kersumre), fabs(d_nonuniform[2 * i + 1] - kersumim)); - if (err > maxerr) - maxerr = err; - } - ansmod = sqrt(kersumre * kersumre + kersumim * kersumim); - printf(" max rel err in values at NU pts: %.3g\n", maxerr / ansmod); - // this is stronger test than for dir=1, since it tests sum of kernel for - // each NU pt. 
However, it cannot detect reading - // from wrong grid pts (they are all unity) - } + // math test is worst-case error from pred value (kersum) on interp pts: + maxerr = 0.0; + for (BIGINT i = 0; i < M; ++i) { + FLT err = std::max(fabs(d_nonuniform[2 * i] - kersumre), + fabs(d_nonuniform[2 * i + 1] - kersumim)); + if (err > maxerr) maxerr = err; + } + ansmod = sqrt(kersumre * kersumre + kersumim * kersumim); + printf(" max rel err in values at NU pts: %.3g\n", maxerr / ansmod); + // this is stronger test than for dir=1, since it tests sum of kernel for + // each NU pt. However, it cannot detect reading + // from wrong grid pts (they are all unity) } - return 0; + } + return 0; } diff --git a/src/cuda/1d/cufinufft1d.cu b/src/cuda/1d/cufinufft1d.cu index 62574c06b..26eaff491 100644 --- a/src/cuda/1d/cufinufft1d.cu +++ b/src/cuda/1d/cufinufft1d.cu @@ -16,8 +16,9 @@ using namespace cufinufft::deconvolve; using namespace cufinufft::spreadinterp; -template -int cufinufft1d1_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan) +template +int cufinufft1d1_exec(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan) /* 1D Type-1 NUFFT @@ -31,48 +32,48 @@ int cufinufft1d1_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_pla Melody Shih 11/21/21 */ { - assert(d_plan->spopts.spread_direction == 1); - auto &stream = d_plan->stream; - - int ier; - cuda_complex *d_fkstart; - cuda_complex *d_cstart; - for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { - int blksize = std::min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); - d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; - d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms; - d_plan->c = d_cstart; - d_plan->fk = d_fkstart; - - // this is needed - if ((ier = checkCudaErrors( - cudaMemsetAsync(d_plan->fw, 0, d_plan->maxbatchsize * d_plan->nf1 * sizeof(cuda_complex), stream)))) - return ier; - - // Step 1: Spread - if ((ier = cuspread1d(d_plan, blksize))) - return ier; - - // Step 2: FFT - cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); - if (cufft_status != CUFFT_SUCCESS) - return FINUFFT_ERR_CUDA_FAILURE; - - // Step 3: deconvolve and shuffle - if (d_plan->opts.modeord == 0) { - if ((ier = cudeconvolve1d(d_plan, blksize))) - return ier; - } else { - if ((ier = cudeconvolve1d(d_plan, blksize))) - return ier; - } + assert(d_plan->spopts.spread_direction == 1); + auto &stream = d_plan->stream; + + int ier; + cuda_complex *d_fkstart; + cuda_complex *d_cstart; + for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { + int blksize = + std::min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); + d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; + d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms; + d_plan->c = d_cstart; + d_plan->fk = d_fkstart; + + // this is needed + if ((ier = checkCudaErrors(cudaMemsetAsync( + d_plan->fw, 0, d_plan->maxbatchsize * d_plan->nf1 * sizeof(cuda_complex), + stream)))) + return ier; + + // Step 1: Spread + if ((ier = cuspread1d(d_plan, blksize))) return ier; + + // Step 2: FFT + cufftResult cufft_status = + cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); + if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE; + + // Step 3: deconvolve and shuffle + if (d_plan->opts.modeord == 0) { + if ((ier = cudeconvolve1d(d_plan, blksize))) return ier; + } else { + if ((ier = cudeconvolve1d(d_plan, blksize))) return ier; } + } - return 0; + return 0; 
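// [Illustrative aside, not part of this patch] The batching loop above walks
// the ntransf transforms in chunks of at most maxbatchsize, offsetting the
// strength array by i*maxbatchsize*M and the mode array by i*maxbatchsize*ms
// for each chunk. A minimal host-side sketch of the same index arithmetic
// (plain C++, hypothetical helper name and sizes):
#include <algorithm>
#include <cstdio>
static void show_batches(int ntransf, int maxbatchsize, long long M, long long ms) {
  for (int i = 0; i * maxbatchsize < ntransf; i++) {
    int blksize = std::min(ntransf - i * maxbatchsize, maxbatchsize);
    long long c_off  = (long long)i * maxbatchsize * M;  // offset into d_c
    long long fk_off = (long long)i * maxbatchsize * ms; // offset into d_fk
    std::printf("batch %d: %d transforms, c+=%lld, fk+=%lld\n", i, blksize, c_off,
                fk_off);
  }
}
// e.g. show_batches(10, 4, 1000, 256) reports chunks of 4, 4 and 2 transforms.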
} -template -int cufinufft1d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan) +template +int cufinufft1d2_exec(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan) /* 1D Type-2 NUFFT @@ -86,46 +87,46 @@ int cufinufft1d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_pla Melody Shih 11/21/21 */ { - assert(d_plan->spopts.spread_direction == 2); - - int ier; - cuda_complex *d_fkstart; - cuda_complex *d_cstart; - for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { - int blksize = std::min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); - d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; - d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms; - - d_plan->c = d_cstart; - d_plan->fk = d_fkstart; - - // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw - if (d_plan->opts.modeord == 0) { - if ((ier = cudeconvolve1d(d_plan, blksize))) - return ier; - } else { - if ((ier = cudeconvolve1d(d_plan, blksize))) - return ier; - } - - // Step 2: FFT - cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); - if (cufft_status != CUFFT_SUCCESS) - return FINUFFT_ERR_CUDA_FAILURE; - - // Step 3: deconvolve and shuffle - if ((ier = cuinterp1d(d_plan, blksize))) - return ier; + assert(d_plan->spopts.spread_direction == 2); + + int ier; + cuda_complex *d_fkstart; + cuda_complex *d_cstart; + for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { + int blksize = + std::min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); + d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; + d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms; + + d_plan->c = d_cstart; + d_plan->fk = d_fkstart; + + // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw + if (d_plan->opts.modeord == 0) { + if ((ier = cudeconvolve1d(d_plan, blksize))) return ier; + } else { + if ((ier = cudeconvolve1d(d_plan, blksize))) return ier; } - return 0; + // Step 2: FFT + cufftResult cufft_status = + cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); + if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE; + + // Step 3: deconvolve and shuffle + if ((ier = cuinterp1d(d_plan, blksize))) return ier; + } + + return 0; } template int cufinufft1d1_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); -template int cufinufft1d1_exec(cuda_complex *d_c, cuda_complex *d_fk, +template int cufinufft1d1_exec(cuda_complex *d_c, + cuda_complex *d_fk, cufinufft_plan_t *d_plan); template int cufinufft1d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); -template int cufinufft1d2_exec(cuda_complex *d_c, cuda_complex *d_fk, +template int cufinufft1d2_exec(cuda_complex *d_c, + cuda_complex *d_fk, cufinufft_plan_t *d_plan); diff --git a/src/cuda/1d/interp1d_wrapper.cu b/src/cuda/1d/interp1d_wrapper.cu index 0940f10de..cd3637c8b 100644 --- a/src/cuda/1d/interp1d_wrapper.cu +++ b/src/cuda/1d/interp1d_wrapper.cu @@ -14,7 +14,7 @@ using namespace cufinufft::memtransfer; namespace cufinufft { namespace spreadinterp { -template +template int cuinterp1d(cufinufft_plan_t *d_plan, int blksize) /* A wrapper for different interpolation methods. 
@@ -26,58 +26,60 @@ int cuinterp1d(cufinufft_plan_t *d_plan, int blksize) Melody Shih 11/21/21 */ { - int nf1 = d_plan->nf1; - int M = d_plan->M; - - int ier; - switch (d_plan->opts.gpu_method) { - case 1: { - ier = cuinterp1d_nuptsdriven(nf1, M, d_plan, blksize); - } break; - default: - std::cerr << "[cuinterp1d] error: incorrect method, should be 1" << std::endl; - ier = FINUFFT_ERR_METHOD_NOTVALID; - } - - return ier; + int nf1 = d_plan->nf1; + int M = d_plan->M; + + int ier; + switch (d_plan->opts.gpu_method) { + case 1: { + ier = cuinterp1d_nuptsdriven(nf1, M, d_plan, blksize); + } break; + default: + std::cerr << "[cuinterp1d] error: incorrect method, should be 1" << std::endl; + ier = FINUFFT_ERR_METHOD_NOTVALID; + } + + return ier; } -template +template int cuinterp1d_nuptsdriven(int nf1, int M, cufinufft_plan_t *d_plan, int blksize) { - auto &stream = d_plan->stream; - dim3 threadsPerBlock; - dim3 blocks; - - int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells - T es_c = d_plan->spopts.ES_c; - T es_beta = d_plan->spopts.ES_beta; - T sigma = d_plan->opts.upsampfac; - int *d_idxnupts = d_plan->idxnupts; - - T *d_kx = d_plan->kx; - cuda_complex *d_c = d_plan->c; - cuda_complex *d_fw = d_plan->fw; - - threadsPerBlock.x = 32; - threadsPerBlock.y = 1; - blocks.x = (M + threadsPerBlock.x - 1) / threadsPerBlock.x; - blocks.y = 1; - - if (d_plan->opts.gpu_kerevalmeth) { - for (int t = 0; t < blksize; t++) { - interp_1d_nuptsdriven<<>>( - d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, d_idxnupts); - RETURN_IF_CUDA_ERROR - } - } else { - for (int t = 0; t < blksize; t++) { - interp_1d_nuptsdriven<<>>( - d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, d_idxnupts); - RETURN_IF_CUDA_ERROR - } + auto &stream = d_plan->stream; + dim3 threadsPerBlock; + dim3 blocks; + + int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells + T es_c = d_plan->spopts.ES_c; + T es_beta = d_plan->spopts.ES_beta; + T sigma = d_plan->opts.upsampfac; + int *d_idxnupts = d_plan->idxnupts; + + T *d_kx = d_plan->kx; + cuda_complex *d_c = d_plan->c; + cuda_complex *d_fw = d_plan->fw; + + threadsPerBlock.x = 32; + threadsPerBlock.y = 1; + blocks.x = (M + threadsPerBlock.x - 1) / threadsPerBlock.x; + blocks.y = 1; + + if (d_plan->opts.gpu_kerevalmeth) { + for (int t = 0; t < blksize; t++) { + interp_1d_nuptsdriven<<>>( + d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, + d_idxnupts); + RETURN_IF_CUDA_ERROR + } + } else { + for (int t = 0; t < blksize; t++) { + interp_1d_nuptsdriven<<>>( + d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, + d_idxnupts); + RETURN_IF_CUDA_ERROR } + } - return 0; + return 0; } template int cuinterp1d(cufinufft_plan_t *d_plan, int blksize); diff --git a/src/cuda/1d/spread1d_wrapper.cu b/src/cuda/1d/spread1d_wrapper.cu index c41ce0919..e72ade469 100644 --- a/src/cuda/1d/spread1d_wrapper.cu +++ b/src/cuda/1d/spread1d_wrapper.cu @@ -19,7 +19,7 @@ using namespace cufinufft::memtransfer; namespace cufinufft { namespace spreadinterp { -template +template int cuspread1d(cufinufft_plan_t *d_plan, int blksize) /* A wrapper for different spreading methods. 
@@ -31,143 +31,52 @@ int cuspread1d(cufinufft_plan_t *d_plan, int blksize) Melody Shih 11/21/21 */ { - int nf1 = d_plan->nf1; - int M = d_plan->M; - - int ier; - switch (d_plan->opts.gpu_method) { - case 1: { - ier = cuspread1d_nuptsdriven(nf1, M, d_plan, blksize); - } break; - case 2: { - ier = cuspread1d_subprob(nf1, M, d_plan, blksize); - } break; - default: - std::cerr << "[cuspread1d] error: incorrect method, should be 1 or 2\n"; - ier = FINUFFT_ERR_METHOD_NOTVALID; - } - - return ier; + int nf1 = d_plan->nf1; + int M = d_plan->M; + + int ier; + switch (d_plan->opts.gpu_method) { + case 1: { + ier = cuspread1d_nuptsdriven(nf1, M, d_plan, blksize); + } break; + case 2: { + ier = cuspread1d_subprob(nf1, M, d_plan, blksize); + } break; + default: + std::cerr << "[cuspread1d] error: incorrect method, should be 1 or 2\n"; + ier = FINUFFT_ERR_METHOD_NOTVALID; + } + + return ier; } -template +template int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t *d_plan) { - auto &stream = d_plan->stream; - - if (d_plan->opts.gpu_sort) { - int bin_size_x = d_plan->opts.gpu_binsizex; - if (bin_size_x < 0) { - std::cerr << "[cuspread1d_nuptsdriven_prop] error: invalid binsize (binsizex) = (" << bin_size_x << ")\n"; - return FINUFFT_ERR_BINSIZE_NOTVALID; - } - - int numbins = ceil((T)nf1 / bin_size_x); - - T *d_kx = d_plan->kx; - - int *d_binsize = d_plan->binsize; - int *d_binstartpts = d_plan->binstartpts; - int *d_sortidx = d_plan->sortidx; - int *d_idxnupts = d_plan->idxnupts; - - int ier; - if ((ier = checkCudaErrors(cudaMemsetAsync(d_binsize, 0, numbins * sizeof(int), stream)))) - return ier; - calc_bin_size_noghost_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(M, nf1, bin_size_x, numbins, d_binsize, - d_kx, d_sortidx); - RETURN_IF_CUDA_ERROR - - int n = numbins; - thrust::device_ptr d_ptr(d_binsize); - thrust::device_ptr d_result(d_binstartpts); - thrust::exclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); - - calc_inverse_of_global_sort_idx_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( - M, bin_size_x, numbins, d_binstartpts, d_sortidx, d_kx, d_idxnupts, nf1); - RETURN_IF_CUDA_ERROR - } else { - int *d_idxnupts = d_plan->idxnupts; - trivial_global_sort_index_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(M, d_idxnupts); - RETURN_IF_CUDA_ERROR - } - - return 0; -} - -template -int cuspread1d_nuptsdriven(int nf1, int M, cufinufft_plan_t *d_plan, int blksize) { - auto &stream = d_plan->stream; - dim3 threadsPerBlock; - dim3 blocks; - - int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells - int *d_idxnupts = d_plan->idxnupts; - T es_c = d_plan->spopts.ES_c; - T es_beta = d_plan->spopts.ES_beta; - T sigma = d_plan->spopts.upsampfac; - - T *d_kx = d_plan->kx; - cuda_complex *d_c = d_plan->c; - cuda_complex *d_fw = d_plan->fw; - - threadsPerBlock.x = 16; - threadsPerBlock.y = 1; - blocks.x = (M + threadsPerBlock.x - 1) / threadsPerBlock.x; - blocks.y = 1; - - if (d_plan->opts.gpu_kerevalmeth) { - for (int t = 0; t < blksize; t++) { - spread_1d_nuptsdriven<<>>( - d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, d_idxnupts); - RETURN_IF_CUDA_ERROR - } - } else { - for (int t = 0; t < blksize; t++) { - spread_1d_nuptsdriven<<>>( - d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, d_idxnupts); - RETURN_IF_CUDA_ERROR - } - } - - return 0; -} - -template -int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t *d_plan) -/* - This function determines the properties for spreading that are independent - of 
the strength of the nodes, only relates to the locations of the nodes, - which only needs to be done once. -*/ -{ - auto &stream = d_plan->stream; - int ier; + auto &stream = d_plan->stream; - int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; + if (d_plan->opts.gpu_sort) { int bin_size_x = d_plan->opts.gpu_binsizex; if (bin_size_x < 0) { - std::cerr << "[cuspread1d_subprob_prop] error: invalid binsize (binsizex) = (" << bin_size_x << ")\n"; - return FINUFFT_ERR_BINSIZE_NOTVALID; + std::cerr << "[cuspread1d_nuptsdriven_prop] error: invalid binsize (binsizex) = (" + << bin_size_x << ")\n"; + return FINUFFT_ERR_BINSIZE_NOTVALID; } int numbins = ceil((T)nf1 / bin_size_x); T *d_kx = d_plan->kx; - int *d_binsize = d_plan->binsize; + int *d_binsize = d_plan->binsize; int *d_binstartpts = d_plan->binstartpts; - int *d_sortidx = d_plan->sortidx; - int *d_numsubprob = d_plan->numsubprob; - int *d_subprobstartpts = d_plan->subprobstartpts; - int *d_idxnupts = d_plan->idxnupts; - - int *d_subprob_to_bin = nullptr; + int *d_sortidx = d_plan->sortidx; + int *d_idxnupts = d_plan->idxnupts; - - if ((ier = checkCudaErrors(cudaMemsetAsync(d_binsize, 0, numbins * sizeof(int), stream)))) - return ier; - calc_bin_size_noghost_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(M, nf1, bin_size_x, numbins, d_binsize, d_kx, - d_sortidx); + int ier; + if ((ier = checkCudaErrors( + cudaMemsetAsync(d_binsize, 0, numbins * sizeof(int), stream)))) + return ier; + calc_bin_size_noghost_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( + M, nf1, bin_size_x, numbins, d_binsize, d_kx, d_sortidx); RETURN_IF_CUDA_ERROR int n = numbins; @@ -178,101 +87,207 @@ int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t *d_plan) calc_inverse_of_global_sort_idx_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( M, bin_size_x, numbins, d_binstartpts, d_sortidx, d_kx, d_idxnupts, nf1); RETURN_IF_CUDA_ERROR - - calc_subprob_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(d_binsize, d_numsubprob, maxsubprobsize, numbins); + } else { + int *d_idxnupts = d_plan->idxnupts; + trivial_global_sort_index_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(M, + d_idxnupts); RETURN_IF_CUDA_ERROR + } - d_ptr = thrust::device_pointer_cast(d_numsubprob); - d_result = thrust::device_pointer_cast(d_subprobstartpts + 1); - thrust::inclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); - - if ((ier = checkCudaErrors(cudaMemsetAsync(d_subprobstartpts, 0, sizeof(int), stream)))) - return ier; + return 0; +} - int totalnumsubprob; - if ((ier = checkCudaErrors( - cudaMemcpyAsync(&totalnumsubprob, &d_subprobstartpts[n], sizeof(int), cudaMemcpyDeviceToHost, stream)))) - return ier; - cudaStreamSynchronize(stream); - if ((ier = checkCudaErrors(cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream)))) - return ier; - map_b_into_subprob_1d<<<(numbins + 1024 - 1) / 1024, 1024, 0, stream>>>(d_subprob_to_bin, d_subprobstartpts, - d_numsubprob, numbins); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err)); - cudaFree(d_subprob_to_bin); - return FINUFFT_ERR_CUDA_FAILURE; +template +int cuspread1d_nuptsdriven(int nf1, int M, cufinufft_plan_t *d_plan, int blksize) { + auto &stream = d_plan->stream; + dim3 threadsPerBlock; + dim3 blocks; + + int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells + int *d_idxnupts = d_plan->idxnupts; + T es_c = d_plan->spopts.ES_c; + T es_beta = d_plan->spopts.ES_beta; + T sigma = 
d_plan->spopts.upsampfac; + + T *d_kx = d_plan->kx; + cuda_complex *d_c = d_plan->c; + cuda_complex *d_fw = d_plan->fw; + + threadsPerBlock.x = 16; + threadsPerBlock.y = 1; + blocks.x = (M + threadsPerBlock.x - 1) / threadsPerBlock.x; + blocks.y = 1; + + if (d_plan->opts.gpu_kerevalmeth) { + for (int t = 0; t < blksize; t++) { + spread_1d_nuptsdriven<<>>( + d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, + d_idxnupts); + RETURN_IF_CUDA_ERROR } + } else { + for (int t = 0; t < blksize; t++) { + spread_1d_nuptsdriven<<>>( + d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, + d_idxnupts); + RETURN_IF_CUDA_ERROR + } + } - assert(d_subprob_to_bin != NULL); - cudaFreeAsync(d_plan->subprob_to_bin, stream); - d_plan->subprob_to_bin = d_subprob_to_bin; - d_plan->totalnumsubprob = totalnumsubprob; - - return 0; + return 0; } -template -int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t *d_plan, int blksize) { - auto &stream = d_plan->stream; +template +int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t *d_plan) +/* + This function determines the properties for spreading that are independent + of the strength of the nodes, only relates to the locations of the nodes, + which only needs to be done once. +*/ +{ + auto &stream = d_plan->stream; + int ier; - int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells - T es_c = d_plan->spopts.ES_c; - T es_beta = d_plan->spopts.ES_beta; - int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; + int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; + int bin_size_x = d_plan->opts.gpu_binsizex; + if (bin_size_x < 0) { + std::cerr << "[cuspread1d_subprob_prop] error: invalid binsize (binsizex) = (" + << bin_size_x << ")\n"; + return FINUFFT_ERR_BINSIZE_NOTVALID; + } - // assume that bin_size_x > ns/2; - int bin_size_x = d_plan->opts.gpu_binsizex; - int numbins = ceil((T)nf1 / bin_size_x); + int numbins = ceil((T)nf1 / bin_size_x); - T *d_kx = d_plan->kx; - cuda_complex *d_c = d_plan->c; - cuda_complex *d_fw = d_plan->fw; + T *d_kx = d_plan->kx; - int *d_binsize = d_plan->binsize; - int *d_binstartpts = d_plan->binstartpts; - int *d_numsubprob = d_plan->numsubprob; - int *d_subprobstartpts = d_plan->subprobstartpts; - int *d_idxnupts = d_plan->idxnupts; + int *d_binsize = d_plan->binsize; + int *d_binstartpts = d_plan->binstartpts; + int *d_sortidx = d_plan->sortidx; + int *d_numsubprob = d_plan->numsubprob; + int *d_subprobstartpts = d_plan->subprobstartpts; + int *d_idxnupts = d_plan->idxnupts; - int totalnumsubprob = d_plan->totalnumsubprob; - int *d_subprob_to_bin = d_plan->subprob_to_bin; + int *d_subprob_to_bin = nullptr; + if ((ier = + checkCudaErrors(cudaMemsetAsync(d_binsize, 0, numbins * sizeof(int), stream)))) + return ier; + calc_bin_size_noghost_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( + M, nf1, bin_size_x, numbins, d_binsize, d_kx, d_sortidx); + RETURN_IF_CUDA_ERROR - T sigma = d_plan->opts.upsampfac; + int n = numbins; + thrust::device_ptr d_ptr(d_binsize); + thrust::device_ptr d_result(d_binstartpts); + thrust::exclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); - size_t sharedplanorysize = (bin_size_x + 2 * (int)ceil(ns / 2.0)) * sizeof(cuda_complex); - if (sharedplanorysize > 49152) { - std::cerr << "[cuspread1d_subprob] error: not enough shared memory\n"; - return FINUFFT_ERR_INSUFFICIENT_SHMEM; - } + calc_inverse_of_global_sort_idx_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( + M, bin_size_x, numbins, d_binstartpts, d_sortidx, d_kx, 
d_idxnupts, nf1); + RETURN_IF_CUDA_ERROR + + calc_subprob_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(d_binsize, d_numsubprob, + maxsubprobsize, numbins); + RETURN_IF_CUDA_ERROR + + d_ptr = thrust::device_pointer_cast(d_numsubprob); + d_result = thrust::device_pointer_cast(d_subprobstartpts + 1); + thrust::inclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); - if (d_plan->opts.gpu_kerevalmeth) { - for (int t = 0; t < blksize; t++) { - spread_1d_subprob<<>>( - d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, d_binstartpts, d_binsize, - bin_size_x, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, numbins, d_idxnupts); - RETURN_IF_CUDA_ERROR - } - } else { - for (int t = 0; t < blksize; t++) { - spread_1d_subprob<<>>( - d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, d_binstartpts, d_binsize, - bin_size_x, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, numbins, d_idxnupts); - RETURN_IF_CUDA_ERROR - } + if ((ier = checkCudaErrors(cudaMemsetAsync(d_subprobstartpts, 0, sizeof(int), stream)))) + return ier; + + int totalnumsubprob; + if ((ier = + checkCudaErrors(cudaMemcpyAsync(&totalnumsubprob, &d_subprobstartpts[n], + sizeof(int), cudaMemcpyDeviceToHost, stream)))) + return ier; + cudaStreamSynchronize(stream); + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream)))) + return ier; + map_b_into_subprob_1d<<<(numbins + 1024 - 1) / 1024, 1024, 0, stream>>>( + d_subprob_to_bin, d_subprobstartpts, d_numsubprob, numbins); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err)); + cudaFree(d_subprob_to_bin); + return FINUFFT_ERR_CUDA_FAILURE; + } + + assert(d_subprob_to_bin != NULL); + cudaFreeAsync(d_plan->subprob_to_bin, stream); + d_plan->subprob_to_bin = d_subprob_to_bin; + d_plan->totalnumsubprob = totalnumsubprob; + + return 0; +} + +template +int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t *d_plan, int blksize) { + auto &stream = d_plan->stream; + + int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells + T es_c = d_plan->spopts.ES_c; + T es_beta = d_plan->spopts.ES_beta; + int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; + + // assume that bin_size_x > ns/2; + int bin_size_x = d_plan->opts.gpu_binsizex; + int numbins = ceil((T)nf1 / bin_size_x); + + T *d_kx = d_plan->kx; + cuda_complex *d_c = d_plan->c; + cuda_complex *d_fw = d_plan->fw; + + int *d_binsize = d_plan->binsize; + int *d_binstartpts = d_plan->binstartpts; + int *d_numsubprob = d_plan->numsubprob; + int *d_subprobstartpts = d_plan->subprobstartpts; + int *d_idxnupts = d_plan->idxnupts; + + int totalnumsubprob = d_plan->totalnumsubprob; + int *d_subprob_to_bin = d_plan->subprob_to_bin; + + T sigma = d_plan->opts.upsampfac; + + size_t sharedplanorysize = + (bin_size_x + 2 * (int)ceil(ns / 2.0)) * sizeof(cuda_complex); + if (sharedplanorysize > 49152) { + std::cerr << "[cuspread1d_subprob] error: not enough shared memory\n"; + return FINUFFT_ERR_INSUFFICIENT_SHMEM; + } + + if (d_plan->opts.gpu_kerevalmeth) { + for (int t = 0; t < blksize; t++) { + spread_1d_subprob<<>>( + d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, + d_binstartpts, d_binsize, bin_size_x, d_subprob_to_bin, d_subprobstartpts, + d_numsubprob, maxsubprobsize, numbins, d_idxnupts); + RETURN_IF_CUDA_ERROR + } + } else { + for (int t = 0; t < blksize; t++) { + 
spread_1d_subprob<<>>( + d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, + d_binstartpts, d_binsize, bin_size_x, d_subprob_to_bin, d_subprobstartpts, + d_numsubprob, maxsubprobsize, numbins, d_idxnupts); + RETURN_IF_CUDA_ERROR } + } - return 0; + return 0; } template int cuspread1d(cufinufft_plan_t *d_plan, int blksize); template int cuspread1d(cufinufft_plan_t *d_plan, int blksize); -template int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t *d_plan); -template int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t *d_plan); -template int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t *d_plan); -template int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t *d_plan); +template int cuspread1d_nuptsdriven_prop(int nf1, int M, + cufinufft_plan_t *d_plan); +template int cuspread1d_nuptsdriven_prop(int nf1, int M, + cufinufft_plan_t *d_plan); +template int cuspread1d_subprob_prop(int nf1, int M, + cufinufft_plan_t *d_plan); +template int cuspread1d_subprob_prop(int nf1, int M, + cufinufft_plan_t *d_plan); } // namespace spreadinterp } // namespace cufinufft diff --git a/src/cuda/2d/cufinufft2d.cu b/src/cuda/2d/cufinufft2d.cu index 3cd85281f..afc801b7f 100644 --- a/src/cuda/2d/cufinufft2d.cu +++ b/src/cuda/2d/cufinufft2d.cu @@ -15,8 +15,9 @@ using namespace cufinufft::deconvolve; using namespace cufinufft::spreadinterp; using std::min; -template -int cufinufft2d1_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan) +template +int cufinufft2d1_exec(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan) /* 2D Type-1 NUFFT @@ -30,49 +31,49 @@ int cufinufft2d1_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_pla Melody Shih 07/25/19 */ { - assert(d_plan->spopts.spread_direction == 1); - - int ier; - cuda_complex *d_fkstart; - cuda_complex *d_cstart; - - auto &stream = d_plan->stream; - for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { - int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); - d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; - d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt; - d_plan->c = d_cstart; - d_plan->fk = d_fkstart; - - // this is needed - if ((ier = checkCudaErrors(cudaMemsetAsync( - d_plan->fw, 0, d_plan->maxbatchsize * d_plan->nf1 * d_plan->nf2 * sizeof(cuda_complex), stream)))) - return ier; - - // Step 1: Spread - if ((ier = cuspread2d(d_plan, blksize))) - return ier; - - // Step 2: FFT - cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); - if (cufft_status != CUFFT_SUCCESS) - return FINUFFT_ERR_CUDA_FAILURE; - - // Step 3: deconvolve and shuffle - if (d_plan->opts.modeord == 0) { - if ((ier = cudeconvolve2d(d_plan, blksize))) - return ier; - } else { - if ((ier = cudeconvolve2d(d_plan, blksize))) - return ier; - } + assert(d_plan->spopts.spread_direction == 1); + + int ier; + cuda_complex *d_fkstart; + cuda_complex *d_cstart; + + auto &stream = d_plan->stream; + for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { + int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); + d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; + d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt; + d_plan->c = d_cstart; + d_plan->fk = d_fkstart; + + // this is needed + if ((ier = checkCudaErrors(cudaMemsetAsync( + d_plan->fw, 0, + d_plan->maxbatchsize * d_plan->nf1 * d_plan->nf2 * sizeof(cuda_complex), + stream)))) 
+ return ier; + + // Step 1: Spread + if ((ier = cuspread2d(d_plan, blksize))) return ier; + + // Step 2: FFT + cufftResult cufft_status = + cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); + if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE; + + // Step 3: deconvolve and shuffle + if (d_plan->opts.modeord == 0) { + if ((ier = cudeconvolve2d(d_plan, blksize))) return ier; + } else { + if ((ier = cudeconvolve2d(d_plan, blksize))) return ier; } + } - return 0; + return 0; } -template -int cufinufft2d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan) +template +int cufinufft2d2_exec(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan) /* 2D Type-2 NUFFT @@ -86,46 +87,45 @@ int cufinufft2d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_pla Melody Shih 07/25/19 */ { - assert(d_plan->spopts.spread_direction == 2); - - int ier; - cuda_complex *d_fkstart; - cuda_complex *d_cstart; - for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { - int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); - d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; - d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt; - - d_plan->c = d_cstart; - d_plan->fk = d_fkstart; - - // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw - if (d_plan->opts.modeord == 0) { - if ((ier = cudeconvolve2d(d_plan, blksize))) - return ier; - } else { - if ((ier = cudeconvolve2d(d_plan, blksize))) - return ier; - } - - // Step 2: FFT - cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); - if (cufft_status != CUFFT_SUCCESS) - return FINUFFT_ERR_CUDA_FAILURE; - - // Step 3: deconvolve and shuffle - if ((ier = cuinterp2d(d_plan, blksize))) - return ier; + assert(d_plan->spopts.spread_direction == 2); + + int ier; + cuda_complex *d_fkstart; + cuda_complex *d_cstart; + for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { + int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); + d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; + d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt; + + d_plan->c = d_cstart; + d_plan->fk = d_fkstart; + + // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw + if (d_plan->opts.modeord == 0) { + if ((ier = cudeconvolve2d(d_plan, blksize))) return ier; + } else { + if ((ier = cudeconvolve2d(d_plan, blksize))) return ier; } - return 0; + // Step 2: FFT + cufftResult cufft_status = + cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); + if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE; + + // Step 3: deconvolve and shuffle + if ((ier = cuinterp2d(d_plan, blksize))) return ier; + } + + return 0; } template int cufinufft2d1_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); -template int cufinufft2d1_exec(cuda_complex *d_c, cuda_complex *d_fk, +template int cufinufft2d1_exec(cuda_complex *d_c, + cuda_complex *d_fk, cufinufft_plan_t *d_plan); template int cufinufft2d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); -template int cufinufft2d2_exec(cuda_complex *d_c, cuda_complex *d_fk, +template int cufinufft2d2_exec(cuda_complex *d_c, + cuda_complex *d_fk, cufinufft_plan_t *d_plan); diff --git a/src/cuda/2d/interp2d_wrapper.cu b/src/cuda/2d/interp2d_wrapper.cu index c62188e90..533788482 100644 --- a/src/cuda/2d/interp2d_wrapper.cu +++ b/src/cuda/2d/interp2d_wrapper.cu 
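The exec routines above walk d_plan->ntransf transforms in chunks of at most maxbatchsize, offsetting the strengths (d_c) and modes (d_fk) pointers per batch before the spread, FFT, and deconvolve steps. A minimal host-side sketch of that batching arithmetic, with hypothetical sizes standing in for the plan fields:

// Sketch only (not part of the patch): batching arithmetic of the exec loops.
#include <algorithm>
#include <cstdio>

int main() {
  const int ntransf      = 10;   // hypothetical: total transforms requested
  const int maxbatchsize = 4;    // hypothetical: transforms processed per batch
  const int M            = 1000; // hypothetical: nonuniform points per transform
  const int ms = 64, mt = 64;    // hypothetical: Fourier modes per transform

  for (int i = 0; i * maxbatchsize < ntransf; i++) {
    const int blksize = std::min(ntransf - i * maxbatchsize, maxbatchsize);
    const long long c_offset  = 1LL * i * maxbatchsize * M;       // offset into d_c
    const long long fk_offset = 1LL * i * maxbatchsize * ms * mt; // offset into d_fk
    std::printf("batch %d: blksize=%d c+=%lld fk+=%lld\n", i, blksize, c_offset,
                fk_offset);
  }
  return 0;
}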
@@ -14,7 +14,7 @@ using namespace cufinufft::memtransfer; namespace cufinufft { namespace spreadinterp { -template +template int cuinterp2d(cufinufft_plan_t *d_plan, int blksize) /* A wrapper for different interpolation methods. @@ -26,127 +26,130 @@ int cuinterp2d(cufinufft_plan_t *d_plan, int blksize) Melody Shih 07/25/19 */ { - int nf1 = d_plan->nf1; - int nf2 = d_plan->nf2; - int M = d_plan->M; - - int ier; - switch (d_plan->opts.gpu_method) { - case 1: { - ier = cuinterp2d_nuptsdriven(nf1, nf2, M, d_plan, blksize); - } break; - case 2: { - ier = cuinterp2d_subprob(nf1, nf2, M, d_plan, blksize); - } break; - default: - std::cerr << "[cuinterp2d] error: incorrect method, should be 1 or 2\n"; - ier = FINUFFT_ERR_METHOD_NOTVALID; - } - - return ier; + int nf1 = d_plan->nf1; + int nf2 = d_plan->nf2; + int M = d_plan->M; + + int ier; + switch (d_plan->opts.gpu_method) { + case 1: { + ier = cuinterp2d_nuptsdriven(nf1, nf2, M, d_plan, blksize); + } break; + case 2: { + ier = cuinterp2d_subprob(nf1, nf2, M, d_plan, blksize); + } break; + default: + std::cerr << "[cuinterp2d] error: incorrect method, should be 1 or 2\n"; + ier = FINUFFT_ERR_METHOD_NOTVALID; + } + + return ier; } -template -int cuinterp2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int blksize) { - auto &stream = d_plan->stream; - - dim3 threadsPerBlock; - dim3 blocks; - - int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells - T es_c = d_plan->spopts.ES_c; - T es_beta = d_plan->spopts.ES_beta; - T sigma = d_plan->opts.upsampfac; - - int *d_idxnupts = d_plan->idxnupts; - - T *d_kx = d_plan->kx; - T *d_ky = d_plan->ky; - cuda_complex *d_c = d_plan->c; - cuda_complex *d_fw = d_plan->fw; - - threadsPerBlock.x = 32; - threadsPerBlock.y = 1; - blocks.x = (M + threadsPerBlock.x - 1) / threadsPerBlock.x; - blocks.y = 1; - - if (d_plan->opts.gpu_kerevalmeth) { - for (int t = 0; t < blksize; t++) { - interp_2d_nupts_driven - <<>>(d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, - es_c, es_beta, sigma, d_idxnupts); - RETURN_IF_CUDA_ERROR - } - } else { - for (int t = 0; t < blksize; t++) { - interp_2d_nupts_driven - <<>>(d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, - es_c, es_beta, sigma, d_idxnupts); - RETURN_IF_CUDA_ERROR - } +template +int cuinterp2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, + int blksize) { + auto &stream = d_plan->stream; + + dim3 threadsPerBlock; + dim3 blocks; + + int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells + T es_c = d_plan->spopts.ES_c; + T es_beta = d_plan->spopts.ES_beta; + T sigma = d_plan->opts.upsampfac; + + int *d_idxnupts = d_plan->idxnupts; + + T *d_kx = d_plan->kx; + T *d_ky = d_plan->ky; + cuda_complex *d_c = d_plan->c; + cuda_complex *d_fw = d_plan->fw; + + threadsPerBlock.x = 32; + threadsPerBlock.y = 1; + blocks.x = (M + threadsPerBlock.x - 1) / threadsPerBlock.x; + blocks.y = 1; + + if (d_plan->opts.gpu_kerevalmeth) { + for (int t = 0; t < blksize; t++) { + interp_2d_nupts_driven<<>>( + d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, + sigma, d_idxnupts); + RETURN_IF_CUDA_ERROR } + } else { + for (int t = 0; t < blksize; t++) { + interp_2d_nupts_driven<<>>( + d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, + sigma, d_idxnupts); + RETURN_IF_CUDA_ERROR + } + } - return 0; + return 0; } -template -int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int blksize) { - auto &stream = 
d_plan->stream; - - int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells - T es_c = d_plan->spopts.ES_c; - T es_beta = d_plan->spopts.ES_beta; - int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; - - // assume that bin_size_x > ns/2; - int bin_size_x = d_plan->opts.gpu_binsizex; - int bin_size_y = d_plan->opts.gpu_binsizey; - int numbins[2]; - numbins[0] = ceil((T)nf1 / bin_size_x); - numbins[1] = ceil((T)nf2 / bin_size_y); - - T *d_kx = d_plan->kx; - T *d_ky = d_plan->ky; - cuda_complex *d_c = d_plan->c; - cuda_complex *d_fw = d_plan->fw; - - int *d_binsize = d_plan->binsize; - int *d_binstartpts = d_plan->binstartpts; - int *d_numsubprob = d_plan->numsubprob; - int *d_subprobstartpts = d_plan->subprobstartpts; - int *d_idxnupts = d_plan->idxnupts; - int *d_subprob_to_bin = d_plan->subprob_to_bin; - int totalnumsubprob = d_plan->totalnumsubprob; - - - T sigma = d_plan->opts.upsampfac; - size_t sharedplanorysize = - (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex); - - if (sharedplanorysize > 49152) { - std::cerr << "[cuinterp2d_subprob] error: not enough shared memory\n"; - return FINUFFT_ERR_INSUFFICIENT_SHMEM; +template +int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, + int blksize) { + auto &stream = d_plan->stream; + + int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells + T es_c = d_plan->spopts.ES_c; + T es_beta = d_plan->spopts.ES_beta; + int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; + + // assume that bin_size_x > ns/2; + int bin_size_x = d_plan->opts.gpu_binsizex; + int bin_size_y = d_plan->opts.gpu_binsizey; + int numbins[2]; + numbins[0] = ceil((T)nf1 / bin_size_x); + numbins[1] = ceil((T)nf2 / bin_size_y); + + T *d_kx = d_plan->kx; + T *d_ky = d_plan->ky; + cuda_complex *d_c = d_plan->c; + cuda_complex *d_fw = d_plan->fw; + + int *d_binsize = d_plan->binsize; + int *d_binstartpts = d_plan->binstartpts; + int *d_numsubprob = d_plan->numsubprob; + int *d_subprobstartpts = d_plan->subprobstartpts; + int *d_idxnupts = d_plan->idxnupts; + int *d_subprob_to_bin = d_plan->subprob_to_bin; + int totalnumsubprob = d_plan->totalnumsubprob; + + T sigma = d_plan->opts.upsampfac; + size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) * + (bin_size_y + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex); + + if (sharedplanorysize > 49152) { + std::cerr << "[cuinterp2d_subprob] error: not enough shared memory\n"; + return FINUFFT_ERR_INSUFFICIENT_SHMEM; + } + + if (d_plan->opts.gpu_kerevalmeth) { + for (int t = 0; t < blksize; t++) { + interp_2d_subprob<<>>( + d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, + sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, + d_subprobstartpts, d_numsubprob, maxsubprobsize, numbins[0], numbins[1], + d_idxnupts); + RETURN_IF_CUDA_ERROR } - - if (d_plan->opts.gpu_kerevalmeth) { - for (int t = 0; t < blksize; t++) { - interp_2d_subprob<<>>( - d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, - d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, - numbins[0], numbins[1], d_idxnupts); - RETURN_IF_CUDA_ERROR - } - } else { - for (int t = 0; t < blksize; t++) { - interp_2d_subprob<<>>( - d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, - d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, 
maxsubprobsize, - numbins[0], numbins[1], d_idxnupts); - RETURN_IF_CUDA_ERROR - } + } else { + for (int t = 0; t < blksize; t++) { + interp_2d_subprob<<>>( + d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, + sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, + d_subprobstartpts, d_numsubprob, maxsubprobsize, numbins[0], numbins[1], + d_idxnupts); + RETURN_IF_CUDA_ERROR } + } - return 0; + return 0; } template int cuinterp2d(cufinufft_plan_t *d_plan, int blksize); diff --git a/src/cuda/2d/spread2d_wrapper.cu b/src/cuda/2d/spread2d_wrapper.cu index 3b27f7efd..8c32d316e 100644 --- a/src/cuda/2d/spread2d_wrapper.cu +++ b/src/cuda/2d/spread2d_wrapper.cu @@ -19,7 +19,7 @@ using namespace cufinufft::memtransfer; namespace cufinufft { namespace spreadinterp { -template +template int cuspread2d(cufinufft_plan_t *d_plan, int blksize) /* A wrapper for different spreading methods. @@ -31,135 +31,40 @@ int cuspread2d(cufinufft_plan_t *d_plan, int blksize) Melody Shih 07/25/19 */ { - int nf1 = d_plan->nf1; - int nf2 = d_plan->nf2; - int M = d_plan->M; - - int ier; - switch (d_plan->opts.gpu_method) { - case 1: { - ier = cuspread2d_nuptsdriven(nf1, nf2, M, d_plan, blksize); - } break; - case 2: { - ier = cuspread2d_subprob(nf1, nf2, M, d_plan, blksize); - } break; - default: - std::cerr << "[cuspread2d] error: incorrect method, should be 1 or 2\n"; - ier = FINUFFT_ERR_METHOD_NOTVALID; - } - - return ier; + int nf1 = d_plan->nf1; + int nf2 = d_plan->nf2; + int M = d_plan->M; + + int ier; + switch (d_plan->opts.gpu_method) { + case 1: { + ier = cuspread2d_nuptsdriven(nf1, nf2, M, d_plan, blksize); + } break; + case 2: { + ier = cuspread2d_subprob(nf1, nf2, M, d_plan, blksize); + } break; + default: + std::cerr << "[cuspread2d] error: incorrect method, should be 1 or 2\n"; + ier = FINUFFT_ERR_METHOD_NOTVALID; + } + + return ier; } -template +template int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan) { - auto &stream = d_plan->stream; - - if (d_plan->opts.gpu_sort) { - int bin_size_x = d_plan->opts.gpu_binsizex; - int bin_size_y = d_plan->opts.gpu_binsizey; - if (bin_size_x < 0 || bin_size_y < 0) { - std::cerr << "[cuspread2d_nuptsdriven_prop] error: invalid binsize (binsizex, binsizey) = ("; - std::cerr << bin_size_x << "," << bin_size_y << ")" << std::endl; - return FINUFFT_ERR_BINSIZE_NOTVALID; - } - - int numbins[2]; - numbins[0] = ceil((T)nf1 / bin_size_x); - numbins[1] = ceil((T)nf2 / bin_size_y); - - T *d_kx = d_plan->kx; - T *d_ky = d_plan->ky; - - int *d_binsize = d_plan->binsize; - int *d_binstartpts = d_plan->binstartpts; - int *d_sortidx = d_plan->sortidx; - int *d_idxnupts = d_plan->idxnupts; - - int ier; - if ((ier = checkCudaErrors(cudaMemsetAsync(d_binsize, 0, numbins[0] * numbins[1] * sizeof(int), stream)))) - return ier; - - calc_bin_size_noghost_2d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( - M, nf1, nf2, bin_size_x, bin_size_y, numbins[0], numbins[1], d_binsize, d_kx, d_ky, d_sortidx); - RETURN_IF_CUDA_ERROR - - int n = numbins[0] * numbins[1]; - thrust::device_ptr d_ptr(d_binsize); - thrust::device_ptr d_result(d_binstartpts); - thrust::exclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); - - calc_inverse_of_global_sort_index_2d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( - M, bin_size_x, bin_size_y, numbins[0], numbins[1], d_binstartpts, d_sortidx, d_kx, d_ky, d_idxnupts, nf1, nf2); - RETURN_IF_CUDA_ERROR - } else { - int *d_idxnupts = d_plan->idxnupts; - - 
trivial_global_sort_index_2d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(M, d_idxnupts); - RETURN_IF_CUDA_ERROR - } - - return 0; -} - -template -int cuspread2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int blksize) { - auto &stream = d_plan->stream; - dim3 threadsPerBlock; - dim3 blocks; - - int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells - int *d_idxnupts = d_plan->idxnupts; - T es_c = d_plan->spopts.ES_c; - T es_beta = d_plan->spopts.ES_beta; - T sigma = d_plan->spopts.upsampfac; - - T *d_kx = d_plan->kx; - T *d_ky = d_plan->ky; - cuda_complex *d_c = d_plan->c; - cuda_complex *d_fw = d_plan->fw; - - threadsPerBlock.x = 16; - threadsPerBlock.y = 1; - blocks.x = (M + threadsPerBlock.x - 1) / threadsPerBlock.x; - blocks.y = 1; - if (d_plan->opts.gpu_kerevalmeth) { - for (int t = 0; t < blksize; t++) { - spread_2d_nupts_driven - <<>>(d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, - es_c, es_beta, sigma, d_idxnupts); - RETURN_IF_CUDA_ERROR - } - } else { - for (int t = 0; t < blksize; t++) { - spread_2d_nupts_driven - <<>>(d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, - es_c, es_beta, sigma, d_idxnupts); - RETURN_IF_CUDA_ERROR - } - } + auto &stream = d_plan->stream; - return 0; -} - -template -int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan) -/* - This function determines the properties for spreading that are independent - of the strength of the nodes, only relates to the locations of the nodes, - which only needs to be done once. -*/ -{ - auto &stream = d_plan->stream; - - int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; + if (d_plan->opts.gpu_sort) { int bin_size_x = d_plan->opts.gpu_binsizex; int bin_size_y = d_plan->opts.gpu_binsizey; if (bin_size_x < 0 || bin_size_y < 0) { - std::cerr << "[cuspread2d_subprob_prop] error: invalid binsize (binsizex, binsizey) = ("; - std::cerr << bin_size_x << "," << bin_size_y << ")" << std::endl; - return FINUFFT_ERR_BINSIZE_NOTVALID; + std::cerr << "[cuspread2d_nuptsdriven_prop] error: invalid binsize " + "(binsizex, binsizey) = ("; + std::cerr << bin_size_x << "," << bin_size_y << ")" << std::endl; + return FINUFFT_ERR_BINSIZE_NOTVALID; } + int numbins[2]; numbins[0] = ceil((T)nf1 / bin_size_x); numbins[1] = ceil((T)nf2 / bin_size_y); @@ -167,21 +72,19 @@ int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan T *d_kx = d_plan->kx; T *d_ky = d_plan->ky; - int *d_binsize = d_plan->binsize; + int *d_binsize = d_plan->binsize; int *d_binstartpts = d_plan->binstartpts; - int *d_sortidx = d_plan->sortidx; - int *d_numsubprob = d_plan->numsubprob; - int *d_subprobstartpts = d_plan->subprobstartpts; - int *d_idxnupts = d_plan->idxnupts; - - int *d_subprob_to_bin = NULL; + int *d_sortidx = d_plan->sortidx; + int *d_idxnupts = d_plan->idxnupts; int ier; - if ((ier = checkCudaErrors(cudaMemsetAsync(d_binsize, 0, numbins[0] * numbins[1] * sizeof(int), stream)))) - return ier; + if ((ier = checkCudaErrors(cudaMemsetAsync( + d_binsize, 0, numbins[0] * numbins[1] * sizeof(int), stream)))) + return ier; calc_bin_size_noghost_2d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( - M, nf1, nf2, bin_size_x, bin_size_y, numbins[0], numbins[1], d_binsize, d_kx, d_ky, d_sortidx); + M, nf1, nf2, bin_size_x, bin_size_y, numbins[0], numbins[1], d_binsize, d_kx, + d_ky, d_sortidx); RETURN_IF_CUDA_ERROR int n = numbins[0] * numbins[1]; @@ -190,110 +93,226 @@ int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan 
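The subprob property routines compute, on the GPU, each bin's start offset via an exclusive scan of the bin counts, then split each bin into subproblems of bounded size and obtain the subproblem start offsets (and their total) via an inclusive scan written one slot ahead, with the first slot zeroed. A CPU sketch of that bookkeeping, where the per-bin subproblem count is modeled as a ceiling division of the bin's point count by maxsubprobsize and the counts are hypothetical:

// Sketch only: prefix-sum bookkeeping analogous to cuspread2d_subprob_prop.
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  const int maxsubprobsize = 4;                 // hypothetical cap on points per subproblem
  std::vector<int> binsize = {3, 9, 0, 5, 12};  // hypothetical per-bin point counts

  std::vector<int> binstartpts(binsize.size());
  std::exclusive_scan(binsize.begin(), binsize.end(), binstartpts.begin(), 0);

  std::vector<int> numsubprob(binsize.size());  // assumed ceiling division per bin
  for (std::size_t i = 0; i < binsize.size(); ++i)
    numsubprob[i] = (binsize[i] + maxsubprobsize - 1) / maxsubprobsize;

  std::vector<int> subprobstartpts(binsize.size() + 1, 0);  // slot 0 stays zero
  std::inclusive_scan(numsubprob.begin(), numsubprob.end(), subprobstartpts.begin() + 1);

  const int totalnumsubprob = subprobstartpts.back();  // read back as on the device
  std::printf("total subproblems = %d\n", totalnumsubprob);
  return 0;
}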
thrust::exclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); calc_inverse_of_global_sort_index_2d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( - M, bin_size_x, bin_size_y, numbins[0], numbins[1], d_binstartpts, d_sortidx, d_kx, d_ky, d_idxnupts, - nf1, nf2); - RETURN_IF_CUDA_ERROR - calc_subprob_2d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(d_binsize, d_numsubprob, maxsubprobsize, - numbins[0] * numbins[1]); + M, bin_size_x, bin_size_y, numbins[0], numbins[1], d_binstartpts, d_sortidx, d_kx, + d_ky, d_idxnupts, nf1, nf2); RETURN_IF_CUDA_ERROR + } else { + int *d_idxnupts = d_plan->idxnupts; - d_ptr = thrust::device_pointer_cast(d_numsubprob); - d_result = thrust::device_pointer_cast(d_subprobstartpts + 1); - thrust::inclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); - - if ((ier = checkCudaErrors(cudaMemsetAsync(d_subprobstartpts, 0, sizeof(int), stream)))) - return ier; - - int totalnumsubprob; - if ((ier = checkCudaErrors( - cudaMemcpyAsync(&totalnumsubprob, &d_subprobstartpts[n], sizeof(int), cudaMemcpyDeviceToHost, stream)))) - return ier; - cudaStreamSynchronize(stream); - if ((ier = checkCudaErrors(cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream)))) - return ier; - map_b_into_subprob_2d<<<(numbins[0] * numbins[1] + 1024 - 1) / 1024, 1024, 0, stream>>>( - d_subprob_to_bin, d_subprobstartpts, d_numsubprob, numbins[0] * numbins[1]); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err)); - cudaFree(d_subprob_to_bin); - return FINUFFT_ERR_CUDA_FAILURE; - } - - assert(d_subprob_to_bin != NULL); - cudaFreeAsync(d_plan->subprob_to_bin, stream); - d_plan->subprob_to_bin = d_subprob_to_bin; - d_plan->totalnumsubprob = totalnumsubprob; + trivial_global_sort_index_2d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(M, + d_idxnupts); + RETURN_IF_CUDA_ERROR + } - return 0; + return 0; } -template -int cuspread2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int blksize) { - auto &stream = d_plan->stream; - - int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells - T es_c = d_plan->spopts.ES_c; - T es_beta = d_plan->spopts.ES_beta; - int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; - - // assume that bin_size_x > ns/2; - int bin_size_x = d_plan->opts.gpu_binsizex; - int bin_size_y = d_plan->opts.gpu_binsizey; - int numbins[2]; - numbins[0] = ceil((T)nf1 / bin_size_x); - numbins[1] = ceil((T)nf2 / bin_size_y); +template +int cuspread2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, + int blksize) { + auto &stream = d_plan->stream; + dim3 threadsPerBlock; + dim3 blocks; + + int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells + int *d_idxnupts = d_plan->idxnupts; + T es_c = d_plan->spopts.ES_c; + T es_beta = d_plan->spopts.ES_beta; + T sigma = d_plan->spopts.upsampfac; + + T *d_kx = d_plan->kx; + T *d_ky = d_plan->ky; + cuda_complex *d_c = d_plan->c; + cuda_complex *d_fw = d_plan->fw; + + threadsPerBlock.x = 16; + threadsPerBlock.y = 1; + blocks.x = (M + threadsPerBlock.x - 1) / threadsPerBlock.x; + blocks.y = 1; + if (d_plan->opts.gpu_kerevalmeth) { + for (int t = 0; t < blksize; t++) { + spread_2d_nupts_driven<<>>( + d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, + sigma, d_idxnupts); + RETURN_IF_CUDA_ERROR + } + } else { + for (int t = 0; t < blksize; t++) { + spread_2d_nupts_driven<<>>( + d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, 
M, ns, nf1, nf2, es_c, es_beta, + sigma, d_idxnupts); + RETURN_IF_CUDA_ERROR + } + } - T *d_kx = d_plan->kx; - T *d_ky = d_plan->ky; - cuda_complex *d_c = d_plan->c; - cuda_complex *d_fw = d_plan->fw; + return 0; +} - int *d_binsize = d_plan->binsize; - int *d_binstartpts = d_plan->binstartpts; - int *d_numsubprob = d_plan->numsubprob; - int *d_subprobstartpts = d_plan->subprobstartpts; - int *d_idxnupts = d_plan->idxnupts; +template +int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan) +/* + This function determines the properties for spreading that are independent + of the strength of the nodes, only relates to the locations of the nodes, + which only needs to be done once. +*/ +{ + auto &stream = d_plan->stream; + + int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; + int bin_size_x = d_plan->opts.gpu_binsizex; + int bin_size_y = d_plan->opts.gpu_binsizey; + if (bin_size_x < 0 || bin_size_y < 0) { + std::cerr << "[cuspread2d_subprob_prop] error: invalid binsize (binsizex, " + "binsizey) = ("; + std::cerr << bin_size_x << "," << bin_size_y << ")" << std::endl; + return FINUFFT_ERR_BINSIZE_NOTVALID; + } + int numbins[2]; + numbins[0] = ceil((T)nf1 / bin_size_x); + numbins[1] = ceil((T)nf2 / bin_size_y); + + T *d_kx = d_plan->kx; + T *d_ky = d_plan->ky; + + int *d_binsize = d_plan->binsize; + int *d_binstartpts = d_plan->binstartpts; + int *d_sortidx = d_plan->sortidx; + int *d_numsubprob = d_plan->numsubprob; + int *d_subprobstartpts = d_plan->subprobstartpts; + int *d_idxnupts = d_plan->idxnupts; + + int *d_subprob_to_bin = NULL; + + int ier; + if ((ier = checkCudaErrors( + cudaMemsetAsync(d_binsize, 0, numbins[0] * numbins[1] * sizeof(int), stream)))) + return ier; - int totalnumsubprob = d_plan->totalnumsubprob; - int *d_subprob_to_bin = d_plan->subprob_to_bin; + calc_bin_size_noghost_2d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( + M, nf1, nf2, bin_size_x, bin_size_y, numbins[0], numbins[1], d_binsize, d_kx, d_ky, + d_sortidx); + RETURN_IF_CUDA_ERROR + + int n = numbins[0] * numbins[1]; + thrust::device_ptr d_ptr(d_binsize); + thrust::device_ptr d_result(d_binstartpts); + thrust::exclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); + + calc_inverse_of_global_sort_index_2d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( + M, bin_size_x, bin_size_y, numbins[0], numbins[1], d_binstartpts, d_sortidx, d_kx, + d_ky, d_idxnupts, nf1, nf2); + RETURN_IF_CUDA_ERROR + calc_subprob_2d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( + d_binsize, d_numsubprob, maxsubprobsize, numbins[0] * numbins[1]); + RETURN_IF_CUDA_ERROR + + d_ptr = thrust::device_pointer_cast(d_numsubprob); + d_result = thrust::device_pointer_cast(d_subprobstartpts + 1); + thrust::inclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); + + if ((ier = checkCudaErrors(cudaMemsetAsync(d_subprobstartpts, 0, sizeof(int), stream)))) + return ier; - T sigma = d_plan->opts.upsampfac; + int totalnumsubprob; + if ((ier = + checkCudaErrors(cudaMemcpyAsync(&totalnumsubprob, &d_subprobstartpts[n], + sizeof(int), cudaMemcpyDeviceToHost, stream)))) + return ier; + cudaStreamSynchronize(stream); + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream)))) + return ier; + map_b_into_subprob_2d<<<(numbins[0] * numbins[1] + 1024 - 1) / 1024, 1024, 0, stream>>>( + d_subprob_to_bin, d_subprobstartpts, d_numsubprob, numbins[0] * numbins[1]); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "[%s] Error: 
%s\n", __func__, cudaGetErrorString(err)); + cudaFree(d_subprob_to_bin); + return FINUFFT_ERR_CUDA_FAILURE; + } + + assert(d_subprob_to_bin != NULL); + cudaFreeAsync(d_plan->subprob_to_bin, stream); + d_plan->subprob_to_bin = d_subprob_to_bin; + d_plan->totalnumsubprob = totalnumsubprob; + + return 0; +} - size_t sharedplanorysize = - (bin_size_x + 2 * (int)ceil(ns / 2.0)) * (bin_size_y + 2 * (int)ceil(ns / 2.0)) * sizeof(cuda_complex); - if (sharedplanorysize > 49152) { - std::cerr << "[cuspread2d_subprob] error: not enough shared memory\n"; - return FINUFFT_ERR_INSUFFICIENT_SHMEM; +template +int cuspread2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, + int blksize) { + auto &stream = d_plan->stream; + + int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells + T es_c = d_plan->spopts.ES_c; + T es_beta = d_plan->spopts.ES_beta; + int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; + + // assume that bin_size_x > ns/2; + int bin_size_x = d_plan->opts.gpu_binsizex; + int bin_size_y = d_plan->opts.gpu_binsizey; + int numbins[2]; + numbins[0] = ceil((T)nf1 / bin_size_x); + numbins[1] = ceil((T)nf2 / bin_size_y); + + T *d_kx = d_plan->kx; + T *d_ky = d_plan->ky; + cuda_complex *d_c = d_plan->c; + cuda_complex *d_fw = d_plan->fw; + + int *d_binsize = d_plan->binsize; + int *d_binstartpts = d_plan->binstartpts; + int *d_numsubprob = d_plan->numsubprob; + int *d_subprobstartpts = d_plan->subprobstartpts; + int *d_idxnupts = d_plan->idxnupts; + + int totalnumsubprob = d_plan->totalnumsubprob; + int *d_subprob_to_bin = d_plan->subprob_to_bin; + + T sigma = d_plan->opts.upsampfac; + + size_t sharedplanorysize = (bin_size_x + 2 * (int)ceil(ns / 2.0)) * + (bin_size_y + 2 * (int)ceil(ns / 2.0)) * + sizeof(cuda_complex); + if (sharedplanorysize > 49152) { + std::cerr << "[cuspread2d_subprob] error: not enough shared memory\n"; + return FINUFFT_ERR_INSUFFICIENT_SHMEM; + } + + if (d_plan->opts.gpu_kerevalmeth) { + for (int t = 0; t < blksize; t++) { + spread_2d_subprob<<>>( + d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, + sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, + d_subprobstartpts, d_numsubprob, maxsubprobsize, numbins[0], numbins[1], + d_idxnupts); + RETURN_IF_CUDA_ERROR } - - if (d_plan->opts.gpu_kerevalmeth) { - for (int t = 0; t < blksize; t++) { - spread_2d_subprob<<>>( - d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, - d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, - numbins[0], numbins[1], d_idxnupts); - RETURN_IF_CUDA_ERROR - } - } else { - for (int t = 0; t < blksize; t++) { - spread_2d_subprob<<>>( - d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, - d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, - numbins[0], numbins[1], d_idxnupts); - RETURN_IF_CUDA_ERROR - } + } else { + for (int t = 0; t < blksize; t++) { + spread_2d_subprob<<>>( + d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, + sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, + d_subprobstartpts, d_numsubprob, maxsubprobsize, numbins[0], numbins[1], + d_idxnupts); + RETURN_IF_CUDA_ERROR } + } - return 0; + return 0; } template int cuspread2d(cufinufft_plan_t *d_plan, int blksize); template int cuspread2d(cufinufft_plan_t *d_plan, int blksize); -template int 
cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan); -template int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan); -template int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan); -template int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan); +template int cuspread2d_subprob_prop(int nf1, int nf2, int M, + cufinufft_plan_t *d_plan); +template int cuspread2d_subprob_prop(int nf1, int nf2, int M, + cufinufft_plan_t *d_plan); +template int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, + cufinufft_plan_t *d_plan); +template int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, + cufinufft_plan_t *d_plan); } // namespace spreadinterp } // namespace cufinufft diff --git a/src/cuda/3d/cufinufft3d.cu b/src/cuda/3d/cufinufft3d.cu index c1209039a..ea0ef4a86 100644 --- a/src/cuda/3d/cufinufft3d.cu +++ b/src/cuda/3d/cufinufft3d.cu @@ -15,8 +15,9 @@ using namespace cufinufft::deconvolve; using namespace cufinufft::spreadinterp; using std::min; -template -int cufinufft3d1_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan) +template +int cufinufft3d1_exec(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan) /* 3D Type-1 NUFFT @@ -30,47 +31,47 @@ int cufinufft3d1_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_pla Melody Shih 07/25/19 */ { - auto &stream = d_plan->stream; - int ier; - cuda_complex *d_fkstart; - cuda_complex *d_cstart; - for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { - int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); - d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; - d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt * d_plan->mu; - - d_plan->c = d_cstart; - d_plan->fk = d_fkstart; - - if ((ier = checkCudaErrors(cudaMemsetAsync( - d_plan->fw, 0, - d_plan->maxbatchsize * d_plan->nf1 * d_plan->nf2 * d_plan->nf3 * sizeof(cuda_complex), stream)))) - return ier; - - // Step 1: Spread - if ((ier = cuspread3d(d_plan, blksize))) - return ier; - - // Step 2: FFT - cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); - if (cufft_status != CUFFT_SUCCESS) - return FINUFFT_ERR_CUDA_FAILURE; - - // Step 3: deconvolve and shuffle - if (d_plan->opts.modeord == 0) { - if ((ier = cudeconvolve3d(d_plan, blksize))) - return ier; - } else { - if ((ier = cudeconvolve3d(d_plan, blksize))) - return ier; - } + auto &stream = d_plan->stream; + int ier; + cuda_complex *d_fkstart; + cuda_complex *d_cstart; + for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { + int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); + d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; + d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt * d_plan->mu; + + d_plan->c = d_cstart; + d_plan->fk = d_fkstart; + + if ((ier = checkCudaErrors( + cudaMemsetAsync(d_plan->fw, 0, + d_plan->maxbatchsize * d_plan->nf1 * d_plan->nf2 * + d_plan->nf3 * sizeof(cuda_complex), + stream)))) + return ier; + + // Step 1: Spread + if ((ier = cuspread3d(d_plan, blksize))) return ier; + + // Step 2: FFT + cufftResult cufft_status = + cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); + if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE; + + // Step 3: deconvolve and shuffle + if (d_plan->opts.modeord == 0) { + if ((ier = cudeconvolve3d(d_plan, blksize))) return ier; + } else { + if ((ier 
= cudeconvolve3d(d_plan, blksize))) return ier; } + } - return 0; + return 0; } -template -int cufinufft3d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan) +template +int cufinufft3d2_exec(cuda_complex *d_c, cuda_complex *d_fk, + cufinufft_plan_t *d_plan) /* 3D Type-2 NUFFT @@ -84,46 +85,45 @@ int cufinufft3d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_pla Melody Shih 07/25/19 */ { - int ier; - cuda_complex *d_fkstart; - cuda_complex *d_cstart; - for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { - int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); - d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; - d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt * d_plan->mu; - - d_plan->c = d_cstart; - d_plan->fk = d_fkstart; - - // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw - if (d_plan->opts.modeord == 0) { - if ((ier = cudeconvolve3d(d_plan, blksize))) - return ier; - } else { - if ((ier = cudeconvolve3d(d_plan, blksize))) - return ier; - } - - // Step 2: FFT - RETURN_IF_CUDA_ERROR - cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); - if (cufft_status != CUFFT_SUCCESS) - return FINUFFT_ERR_CUDA_FAILURE; - - // Step 3: deconvolve and shuffle - if ((ier = cuinterp3d(d_plan, blksize))) - return ier; + int ier; + cuda_complex *d_fkstart; + cuda_complex *d_cstart; + for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { + int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); + d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; + d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt * d_plan->mu; + + d_plan->c = d_cstart; + d_plan->fk = d_fkstart; + + // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw + if (d_plan->opts.modeord == 0) { + if ((ier = cudeconvolve3d(d_plan, blksize))) return ier; + } else { + if ((ier = cudeconvolve3d(d_plan, blksize))) return ier; } - return 0; + // Step 2: FFT + RETURN_IF_CUDA_ERROR + cufftResult cufft_status = + cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); + if (cufft_status != CUFFT_SUCCESS) return FINUFFT_ERR_CUDA_FAILURE; + + // Step 3: deconvolve and shuffle + if ((ier = cuinterp3d(d_plan, blksize))) return ier; + } + + return 0; } template int cufinufft3d1_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); -template int cufinufft3d1_exec(cuda_complex *d_c, cuda_complex *d_fk, +template int cufinufft3d1_exec(cuda_complex *d_c, + cuda_complex *d_fk, cufinufft_plan_t *d_plan); template int cufinufft3d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); -template int cufinufft3d2_exec(cuda_complex *d_c, cuda_complex *d_fk, +template int cufinufft3d2_exec(cuda_complex *d_c, + cuda_complex *d_fk, cufinufft_plan_t *d_plan); diff --git a/src/cuda/3d/interp3d_wrapper.cu b/src/cuda/3d/interp3d_wrapper.cu index 9cdceccd0..b42231d86 100644 --- a/src/cuda/3d/interp3d_wrapper.cu +++ b/src/cuda/3d/interp3d_wrapper.cu @@ -14,7 +14,7 @@ using namespace cufinufft::memtransfer; namespace cufinufft { namespace spreadinterp { -template +template int cuinterp3d(cufinufft_plan_t *d_plan, int blksize) /* A wrapper for different interpolation methods. 
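The point-driven kernels in these wrappers are launched over the M nonuniform points with a one-dimensional grid sized by a ceiling division, blocks.x = (M + threadsPerBlock.x - 1) / threadsPerBlock.x. A small host-side sketch of that launch-size computation, with a hypothetical M and a block size representative of the wrappers (which use 16 or 32 threads):

// Sketch only: ceiling-division grid sizing used for the point-driven kernels.
#include <cstdio>

int main() {
  const int M   = 100000; // hypothetical: number of nonuniform points
  const int tpb = 16;     // threads per block (wrappers here use 16 or 32)
  const int blocks = (M + tpb - 1) / tpb; // ceil(M / tpb) without floating point
  std::printf("launch %d blocks of %d threads to cover %d points\n", blocks, tpb, M);
  return 0;
}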
@@ -26,141 +26,147 @@ int cuinterp3d(cufinufft_plan_t *d_plan, int blksize) Melody Shih 07/25/19 */ { - int nf1 = d_plan->nf1; - int nf2 = d_plan->nf2; - int nf3 = d_plan->nf3; - int M = d_plan->M; - - int ier; - switch (d_plan->opts.gpu_method) { - case 1: { - ier = cuinterp3d_nuptsdriven(nf1, nf2, nf3, M, d_plan, blksize); - } break; - case 2: { - ier = cuinterp3d_subprob(nf1, nf2, nf3, M, d_plan, blksize); - } break; - default: - std::cerr << "[cuinterp3d] error: incorrect method, should be 1,2\n"; - ier = FINUFFT_ERR_METHOD_NOTVALID; - } - - return ier; + int nf1 = d_plan->nf1; + int nf2 = d_plan->nf2; + int nf3 = d_plan->nf3; + int M = d_plan->M; + + int ier; + switch (d_plan->opts.gpu_method) { + case 1: { + ier = cuinterp3d_nuptsdriven(nf1, nf2, nf3, M, d_plan, blksize); + } break; + case 2: { + ier = cuinterp3d_subprob(nf1, nf2, nf3, M, d_plan, blksize); + } break; + default: + std::cerr << "[cuinterp3d] error: incorrect method, should be 1,2\n"; + ier = FINUFFT_ERR_METHOD_NOTVALID; + } + + return ier; } -template -int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize) { - auto &stream = d_plan->stream; - - dim3 threadsPerBlock; - dim3 blocks; - - int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells - T es_c = d_plan->spopts.ES_c; - T es_beta = d_plan->spopts.ES_beta; - T sigma = d_plan->spopts.upsampfac; - - int *d_idxnupts = d_plan->idxnupts; - - T *d_kx = d_plan->kx; - T *d_ky = d_plan->ky; - T *d_kz = d_plan->kz; - cuda_complex *d_c = d_plan->c; - cuda_complex *d_fw = d_plan->fw; - - threadsPerBlock.x = 16; - threadsPerBlock.y = 1; - blocks.x = (M + threadsPerBlock.x - 1) / threadsPerBlock.x; - blocks.y = 1; - - if (d_plan->opts.gpu_kerevalmeth) { - for (int t = 0; t < blksize; t++) { - interp_3d_nupts_driven - <<>>(d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, - ns, nf1, nf2, nf3, es_c, es_beta, sigma, d_idxnupts); - RETURN_IF_CUDA_ERROR - } - } else { - for (int t = 0; t < blksize; t++) { - interp_3d_nupts_driven - <<>>(d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, - ns, nf1, nf2, nf3, es_c, es_beta, sigma, d_idxnupts); - RETURN_IF_CUDA_ERROR - } - } +template +int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, + int blksize) { + auto &stream = d_plan->stream; - return 0; -} + dim3 threadsPerBlock; + dim3 blocks; -template -int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize) { - auto &stream = d_plan->stream; - - int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells - int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; - - // assume that bin_size_x > ns/2; - int bin_size_x = d_plan->opts.gpu_binsizex; - int bin_size_y = d_plan->opts.gpu_binsizey; - int bin_size_z = d_plan->opts.gpu_binsizez; - int numbins[3]; - numbins[0] = ceil((T)nf1 / bin_size_x); - numbins[1] = ceil((T)nf2 / bin_size_y); - numbins[2] = ceil((T)nf3 / bin_size_z); - - T *d_kx = d_plan->kx; - T *d_ky = d_plan->ky; - T *d_kz = d_plan->kz; - cuda_complex *d_c = d_plan->c; - cuda_complex *d_fw = d_plan->fw; - - int *d_binsize = d_plan->binsize; - int *d_binstartpts = d_plan->binstartpts; - int *d_numsubprob = d_plan->numsubprob; - int *d_subprobstartpts = d_plan->subprobstartpts; - int *d_idxnupts = d_plan->idxnupts; - int *d_subprob_to_bin = d_plan->subprob_to_bin; - int totalnumsubprob = d_plan->totalnumsubprob; - - T sigma = d_plan->spopts.upsampfac; - T es_c = d_plan->spopts.ES_c; - T es_beta = 
d_plan->spopts.ES_beta; - size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)) * - (bin_size_z + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex); - if (sharedplanorysize > 49152) { - std::cerr << "[cuinterp3d_subprob] error: not enough shared memory\n"; - return FINUFFT_ERR_INSUFFICIENT_SHMEM; - } + int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells + T es_c = d_plan->spopts.ES_c; + T es_beta = d_plan->spopts.ES_beta; + T sigma = d_plan->spopts.upsampfac; + + int *d_idxnupts = d_plan->idxnupts; + + T *d_kx = d_plan->kx; + T *d_ky = d_plan->ky; + T *d_kz = d_plan->kz; + cuda_complex *d_c = d_plan->c; + cuda_complex *d_fw = d_plan->fw; + threadsPerBlock.x = 16; + threadsPerBlock.y = 1; + blocks.x = (M + threadsPerBlock.x - 1) / threadsPerBlock.x; + blocks.y = 1; + + if (d_plan->opts.gpu_kerevalmeth) { + for (int t = 0; t < blksize; t++) { + interp_3d_nupts_driven<<>>( + d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, + es_c, es_beta, sigma, d_idxnupts); + RETURN_IF_CUDA_ERROR + } + } else { for (int t = 0; t < blksize; t++) { - if (d_plan->opts.gpu_kerevalmeth == 1) { - interp_3d_subprob<<>>( - d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, es_c, es_beta, sigma, - d_binstartpts, d_binsize, bin_size_x, bin_size_y, bin_size_z, d_subprob_to_bin, d_subprobstartpts, - d_numsubprob, maxsubprobsize, numbins[0], numbins[1], numbins[2], d_idxnupts); - RETURN_IF_CUDA_ERROR - } else { - interp_3d_subprob<<>>( - d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, es_c, es_beta, sigma, - d_binstartpts, d_binsize, bin_size_x, bin_size_y, bin_size_z, d_subprob_to_bin, d_subprobstartpts, - d_numsubprob, maxsubprobsize, numbins[0], numbins[1], numbins[2], d_idxnupts); - RETURN_IF_CUDA_ERROR - } + interp_3d_nupts_driven<<>>( + d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, + es_c, es_beta, sigma, d_idxnupts); + RETURN_IF_CUDA_ERROR + } + } + + return 0; +} + +template +int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, + int blksize) { + auto &stream = d_plan->stream; + + int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells + int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; + + // assume that bin_size_x > ns/2; + int bin_size_x = d_plan->opts.gpu_binsizex; + int bin_size_y = d_plan->opts.gpu_binsizey; + int bin_size_z = d_plan->opts.gpu_binsizez; + int numbins[3]; + numbins[0] = ceil((T)nf1 / bin_size_x); + numbins[1] = ceil((T)nf2 / bin_size_y); + numbins[2] = ceil((T)nf3 / bin_size_z); + + T *d_kx = d_plan->kx; + T *d_ky = d_plan->ky; + T *d_kz = d_plan->kz; + cuda_complex *d_c = d_plan->c; + cuda_complex *d_fw = d_plan->fw; + + int *d_binsize = d_plan->binsize; + int *d_binstartpts = d_plan->binstartpts; + int *d_numsubprob = d_plan->numsubprob; + int *d_subprobstartpts = d_plan->subprobstartpts; + int *d_idxnupts = d_plan->idxnupts; + int *d_subprob_to_bin = d_plan->subprob_to_bin; + int totalnumsubprob = d_plan->totalnumsubprob; + + T sigma = d_plan->spopts.upsampfac; + T es_c = d_plan->spopts.ES_c; + T es_beta = d_plan->spopts.ES_beta; + size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) * + (bin_size_y + 2 * ceil(ns / 2.0)) * + (bin_size_z + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex); + if (sharedplanorysize > 49152) { + std::cerr << "[cuinterp3d_subprob] error: not enough shared memory\n"; + return FINUFFT_ERR_INSUFFICIENT_SHMEM; + } + + 
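The subprob kernels cache one bin of the fine grid, padded by the kernel half-width on every side, in shared memory, and the wrappers reject configurations whose padded tile exceeds the 49152-byte (48 KiB) budget checked above. A host-side sketch of that footprint check for the 3D case, with hypothetical bin sizes and spread width:

// Sketch only: padded-tile shared-memory footprint check of the subprob wrappers.
#include <cmath>
#include <complex>
#include <cstdio>

int main() {
  const int ns = 7;                                            // hypothetical spread width
  const int bin_size_x = 16, bin_size_y = 16, bin_size_z = 2;  // hypothetical bin sizes
  const int pad = 2 * (int)std::ceil(ns / 2.0);                // halo cells on both sides

  const std::size_t bytes = (std::size_t)(bin_size_x + pad) * (bin_size_y + pad) *
                            (bin_size_z + pad) * sizeof(std::complex<float>);
  if (bytes > 49152)
    std::printf("tile of %zu bytes exceeds the 48 KiB budget\n", bytes);
  else
    std::printf("tile of %zu bytes fits in the 48 KiB budget\n", bytes);
  return 0;
}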
for (int t = 0; t < blksize; t++) { + if (d_plan->opts.gpu_kerevalmeth == 1) { + interp_3d_subprob<<>>( + d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, + es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, + bin_size_z, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, + numbins[0], numbins[1], numbins[2], d_idxnupts); + RETURN_IF_CUDA_ERROR + } else { + interp_3d_subprob<<>>( + d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, + es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, + bin_size_z, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, + numbins[0], numbins[1], numbins[2], d_idxnupts); + RETURN_IF_CUDA_ERROR } + } - return 0; + return 0; } template int cuinterp3d(cufinufft_plan_t *d_plan, int blksize); template int cuinterp3d(cufinufft_plan_t *d_plan, int blksize); -template int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, - int blksize); -template int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, - int blksize); +template int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, + cufinufft_plan_t *d_plan, int blksize); +template int cuinterp3d_nuptsdriven( + int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize); -template int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize); -template int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, - int blksize); +template int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, + cufinufft_plan_t *d_plan, int blksize); +template int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, + cufinufft_plan_t *d_plan, int blksize); } // namespace spreadinterp } // namespace cufinufft diff --git a/src/cuda/3d/spread3d_wrapper.cu b/src/cuda/3d/spread3d_wrapper.cu index 13d435e28..b54bcaec1 100644 --- a/src/cuda/3d/spread3d_wrapper.cu +++ b/src/cuda/3d/spread3d_wrapper.cu @@ -18,7 +18,7 @@ using namespace cufinufft::memtransfer; namespace cufinufft { namespace spreadinterp { -template +template int cuspread3d(cufinufft_plan_t *d_plan, int blksize) /* A wrapper for different spreading methods. 
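When opts.gpu_sort is enabled, the nuptsdriven_prop routines in the hunk that follows bin the nonuniform points: one kernel counts points per bin while recording each point's rank within its bin, an exclusive scan turns the counts into bin start offsets, and a second kernel scatters the point indices into idxnupts in bin order. A CPU analogue of that bookkeeping, reduced to one dimension and assuming coordinates already rescaled into [0, nf):

// Sketch only: 1D CPU analogue of the bin-sort bookkeeping in the *_nuptsdriven_prop routines.
#include <cmath>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  const int nf = 32, bin_size = 8;                              // hypothetical grid and bin size
  const int numbins = (int)std::ceil((double)nf / bin_size);
  const std::vector<double> kx = {3.2, 30.9, 7.7, 16.1, 8.0};   // hypothetical rescaled points

  std::vector<int> binsize(numbins, 0), sortidx(kx.size());
  for (std::size_t i = 0; i < kx.size(); ++i) {
    const int b = (int)(kx[i] / bin_size);  // bin index of point i
    sortidx[i]  = binsize[b]++;             // rank of point i within its bin
  }
  std::vector<int> binstartpts(numbins);
  std::exclusive_scan(binsize.begin(), binsize.end(), binstartpts.begin(), 0);

  std::vector<int> idxnupts(kx.size());     // point indices listed bin by bin
  for (std::size_t i = 0; i < kx.size(); ++i) {
    const int b = (int)(kx[i] / bin_size);
    idxnupts[binstartpts[b] + sortidx[i]] = (int)i;
  }
  for (int j : idxnupts) std::printf("%d ", j);  // prints 0 2 4 3 1 for the data above
  std::printf("\n");
  return 0;
}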
@@ -31,521 +31,551 @@ int cuspread3d(cufinufft_plan_t *d_plan, int blksize) Melody Shih 07/25/19 */ { - int nf1 = d_plan->nf1; - int nf2 = d_plan->nf2; - int nf3 = d_plan->nf3; - int M = d_plan->M; - - int ier = 0; - switch (d_plan->opts.gpu_method) { - case 1: { - ier = cuspread3d_nuptsdriven(nf1, nf2, nf3, M, d_plan, blksize); - } break; - case 2: { - ier = cuspread3d_subprob(nf1, nf2, nf3, M, d_plan, blksize); - } break; - case 4: { - ier = cuspread3d_blockgather(nf1, nf2, nf3, M, d_plan, blksize); - } break; - default: - std::cerr << "[cuspread3d] error: incorrect method, should be 1,2,4" << std::endl; - ier = FINUFFT_ERR_METHOD_NOTVALID; - } - - return ier; -} - -template -int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan) { - auto &stream = d_plan->stream; - - if (d_plan->opts.gpu_sort) { - int bin_size_x = d_plan->opts.gpu_binsizex; - int bin_size_y = d_plan->opts.gpu_binsizey; - int bin_size_z = d_plan->opts.gpu_binsizez; - if (bin_size_x < 0 || bin_size_y < 0 || bin_size_z < 0) { - std::cerr << "[cuspread3d_nuptsdriven_prop] error: invalid binsize (binsizex, binsizey, binsizez) = ("; - std::cerr << bin_size_x << "," << bin_size_y << "," << bin_size_z << ")" << std::endl; - return FINUFFT_ERR_BINSIZE_NOTVALID; - } - - int numbins[3]; - numbins[0] = ceil((T)nf1 / bin_size_x); - numbins[1] = ceil((T)nf2 / bin_size_y); - numbins[2] = ceil((T)nf3 / bin_size_z); - - T *d_kx = d_plan->kx; - T *d_ky = d_plan->ky; - T *d_kz = d_plan->kz; - - int *d_binsize = d_plan->binsize; - int *d_binstartpts = d_plan->binstartpts; - int *d_sortidx = d_plan->sortidx; - int *d_idxnupts = d_plan->idxnupts; - - int ier; - if ((ier = checkCudaErrors( - cudaMemsetAsync(d_binsize, 0, numbins[0] * numbins[1] * numbins[2] * sizeof(int), stream)))) - return ier; - calc_bin_size_noghost_3d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( - M, nf1, nf2, nf3, bin_size_x, bin_size_y, bin_size_z, numbins[0], numbins[1], numbins[2], d_binsize, d_kx, - d_ky, d_kz, d_sortidx); - RETURN_IF_CUDA_ERROR - - int n = numbins[0] * numbins[1] * numbins[2]; - thrust::device_ptr d_ptr(d_binsize); - thrust::device_ptr d_result(d_binstartpts); - thrust::exclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); - - calc_inverse_of_global_sort_index_3d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( - M, bin_size_x, bin_size_y, bin_size_z, numbins[0], numbins[1], numbins[2], d_binstartpts, d_sortidx, d_kx, - d_ky, d_kz, d_idxnupts, nf1, nf2, nf3); - RETURN_IF_CUDA_ERROR - } else { - int *d_idxnupts = d_plan->idxnupts; - - trivial_global_sort_index_3d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(M, d_idxnupts); - RETURN_IF_CUDA_ERROR - } - - return 0; -} - -template -int cuspread3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize) { - auto &stream = d_plan->stream; - - dim3 threadsPerBlock; - dim3 blocks; - - int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells - T sigma = d_plan->spopts.upsampfac; - T es_c = d_plan->spopts.ES_c; - T es_beta = d_plan->spopts.ES_beta; - - int *d_idxnupts = d_plan->idxnupts; - T *d_kx = d_plan->kx; - T *d_ky = d_plan->ky; - T *d_kz = d_plan->kz; - cuda_complex *d_c = d_plan->c; - cuda_complex *d_fw = d_plan->fw; - - threadsPerBlock.x = 16; - threadsPerBlock.y = 1; - blocks.x = (M + threadsPerBlock.x - 1) / threadsPerBlock.x; - blocks.y = 1; - - if (d_plan->opts.gpu_kerevalmeth == 1) { - for (int t = 0; t < blksize; t++) { - spread_3d_nupts_driven - <<>>(d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * 
nf1 * nf2 * nf3, M, - ns, nf1, nf2, nf3, es_c, es_beta, sigma, d_idxnupts); - RETURN_IF_CUDA_ERROR - } - } else { - for (int t = 0; t < blksize; t++) { - spread_3d_nupts_driven - <<>>(d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, - ns, nf1, nf2, nf3, es_c, es_beta, sigma, d_idxnupts); - RETURN_IF_CUDA_ERROR - } - } - - return 0; + int nf1 = d_plan->nf1; + int nf2 = d_plan->nf2; + int nf3 = d_plan->nf3; + int M = d_plan->M; + + int ier = 0; + switch (d_plan->opts.gpu_method) { + case 1: { + ier = cuspread3d_nuptsdriven(nf1, nf2, nf3, M, d_plan, blksize); + } break; + case 2: { + ier = cuspread3d_subprob(nf1, nf2, nf3, M, d_plan, blksize); + } break; + case 4: { + ier = cuspread3d_blockgather(nf1, nf2, nf3, M, d_plan, blksize); + } break; + default: + std::cerr << "[cuspread3d] error: incorrect method, should be 1,2,4" << std::endl; + ier = FINUFFT_ERR_METHOD_NOTVALID; + } + + return ier; } -template -int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan) { - auto &stream = d_plan->stream; - - dim3 threadsPerBlock; - dim3 blocks; - - int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; - int o_bin_size_x = d_plan->opts.gpu_obinsizex; - int o_bin_size_y = d_plan->opts.gpu_obinsizey; - int o_bin_size_z = d_plan->opts.gpu_obinsizez; - - int numobins[3]; - if (nf1 % o_bin_size_x != 0 || nf2 % o_bin_size_y != 0 || nf3 % o_bin_size_z != 0) { - std::cerr << "[cuspread3d_blockgather_prop] error:\n"; - std::cerr << " mod(nf(1|2|3), opts.gpu_obinsize(x|y|z)) != 0" << std::endl; - std::cerr << " (nf1, nf2, nf3) = (" << nf1 << ", " << nf2 << ", " << nf3 << ")" << std::endl; - std::cerr << " (obinsizex, obinsizey, obinsizez) = (" << o_bin_size_x << ", " << o_bin_size_y << ", " - << o_bin_size_z << ")" << std::endl; - return FINUFFT_ERR_BINSIZE_NOTVALID; - } - - numobins[0] = ceil((T)nf1 / o_bin_size_x); - numobins[1] = ceil((T)nf2 / o_bin_size_y); - numobins[2] = ceil((T)nf3 / o_bin_size_z); +template +int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M, + cufinufft_plan_t *d_plan) { + auto &stream = d_plan->stream; + if (d_plan->opts.gpu_sort) { int bin_size_x = d_plan->opts.gpu_binsizex; int bin_size_y = d_plan->opts.gpu_binsizey; int bin_size_z = d_plan->opts.gpu_binsizez; - if (o_bin_size_x % bin_size_x != 0 || o_bin_size_y % bin_size_y != 0 || o_bin_size_z % bin_size_z != 0) { - std::cerr << "[cuspread3d_blockgather_prop] error:\n"; - std::cerr << " mod(ops.gpu_obinsize(x|y|z), opts.gpu_binsize(x|y|z)) != 0" << std::endl; - std::cerr << " (binsizex, binsizey, binsizez) = (" << bin_size_x << ", " << bin_size_y << ", " - << bin_size_z << ")" << std::endl; - std::cerr << " (obinsizex, obinsizey, obinsizez) = (" << o_bin_size_x << ", " << o_bin_size_y << ", " - << o_bin_size_z << ")" << std::endl; - return FINUFFT_ERR_BINSIZE_NOTVALID; + if (bin_size_x < 0 || bin_size_y < 0 || bin_size_z < 0) { + std::cerr << "[cuspread3d_nuptsdriven_prop] error: invalid binsize " + "(binsizex, binsizey, binsizez) = ("; + std::cerr << bin_size_x << "," << bin_size_y << "," << bin_size_z << ")" + << std::endl; + return FINUFFT_ERR_BINSIZE_NOTVALID; } - int binsperobinx, binsperobiny, binsperobinz; int numbins[3]; - binsperobinx = o_bin_size_x / bin_size_x + 2; - binsperobiny = o_bin_size_y / bin_size_y + 2; - binsperobinz = o_bin_size_z / bin_size_z + 2; - numbins[0] = numobins[0] * (binsperobinx); - numbins[1] = numobins[1] * (binsperobiny); - numbins[2] = numobins[2] * (binsperobinz); + numbins[0] = ceil((T)nf1 / bin_size_x); + numbins[1] = ceil((T)nf2 
/ bin_size_y); + numbins[2] = ceil((T)nf3 / bin_size_z); T *d_kx = d_plan->kx; T *d_ky = d_plan->ky; T *d_kz = d_plan->kz; - int *d_binsize = d_plan->binsize; - int *d_sortidx = d_plan->sortidx; + int *d_binsize = d_plan->binsize; int *d_binstartpts = d_plan->binstartpts; - int *d_numsubprob = d_plan->numsubprob; - int *d_idxnupts = NULL; - int *d_subprobstartpts = d_plan->subprobstartpts; - int *d_subprob_to_bin = NULL; + int *d_sortidx = d_plan->sortidx; + int *d_idxnupts = d_plan->idxnupts; int ier; - if ((ier = checkCudaErrors( - cudaMemsetAsync(d_binsize, 0, numbins[0] * numbins[1] * numbins[2] * sizeof(int), stream)))) - return ier; - - locate_nupts_to_bins_ghost<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( - M, bin_size_x, bin_size_y, bin_size_z, numobins[0], numobins[1], numobins[2], binsperobinx, binsperobiny, - binsperobinz, d_binsize, d_kx, d_ky, d_kz, d_sortidx, nf1, nf2, nf3); - RETURN_IF_CUDA_ERROR - - threadsPerBlock.x = 8; - threadsPerBlock.y = 8; - threadsPerBlock.z = 8; - - blocks.x = (threadsPerBlock.x + numbins[0] - 1) / threadsPerBlock.x; - blocks.y = (threadsPerBlock.y + numbins[1] - 1) / threadsPerBlock.y; - blocks.z = (threadsPerBlock.z + numbins[2] - 1) / threadsPerBlock.z; - - fill_ghost_bins<<>>(binsperobinx, binsperobiny, binsperobinz, numobins[0], - numobins[1], numobins[2], d_binsize); + if ((ier = checkCudaErrors(cudaMemsetAsync( + d_binsize, 0, numbins[0] * numbins[1] * numbins[2] * sizeof(int), stream)))) + return ier; + calc_bin_size_noghost_3d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( + M, nf1, nf2, nf3, bin_size_x, bin_size_y, bin_size_z, numbins[0], numbins[1], + numbins[2], d_binsize, d_kx, d_ky, d_kz, d_sortidx); RETURN_IF_CUDA_ERROR int n = numbins[0] * numbins[1] * numbins[2]; thrust::device_ptr d_ptr(d_binsize); - thrust::device_ptr d_result(d_binstartpts + 1); - thrust::inclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); - - if ((ier = checkCudaErrors(cudaMemsetAsync(d_binstartpts, 0, sizeof(int), stream)))) - return ier; - - int totalNUpts; - if ((ier = checkCudaErrors( - cudaMemcpyAsync(&totalNUpts, &d_binstartpts[n], sizeof(int), cudaMemcpyDeviceToHost, stream)))) - return ier; - cudaStreamSynchronize(stream); - if ((ier = checkCudaErrors(cudaMallocAsync(&d_idxnupts, totalNUpts * sizeof(int), stream)))) - return ier; - - calc_inverse_of_global_sort_index_ghost<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( - M, bin_size_x, bin_size_y, bin_size_z, numobins[0], numobins[1], numobins[2], binsperobinx, binsperobiny, - binsperobinz, d_binstartpts, d_sortidx, d_kx, d_ky, d_kz, d_idxnupts, nf1, nf2, nf3); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err)); - cudaFree(d_idxnupts); - return FINUFFT_ERR_CUDA_FAILURE; - } - - threadsPerBlock.x = 2; - threadsPerBlock.y = 2; - threadsPerBlock.z = 2; - - blocks.x = (threadsPerBlock.x + numbins[0] - 1) / threadsPerBlock.x; - blocks.y = (threadsPerBlock.y + numbins[1] - 1) / threadsPerBlock.y; - blocks.z = (threadsPerBlock.z + numbins[2] - 1) / threadsPerBlock.z; - - ghost_bin_pts_index<<>>(binsperobinx, binsperobiny, binsperobinz, numobins[0], - numobins[1], numobins[2], d_binsize, d_idxnupts, - d_binstartpts, M); - err = cudaGetLastError(); - if (err != cudaSuccess) { - fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err)); - cudaFree(d_idxnupts); - return FINUFFT_ERR_CUDA_FAILURE; - } - - cudaFree(d_plan->idxnupts); - d_plan->idxnupts = d_idxnupts; + thrust::device_ptr 
d_result(d_binstartpts); + thrust::exclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); - /* --------------------------------------------- */ - // Determining Subproblem properties // - /* --------------------------------------------- */ - n = numobins[0] * numobins[1] * numobins[2]; - calc_subprob_3d_v1<<<(n + 1024 - 1) / 1024, 1024, 0, stream>>>(binsperobinx, binsperobiny, binsperobinz, d_binsize, - d_numsubprob, maxsubprobsize, - numobins[0] * numobins[1] * numobins[2]); + calc_inverse_of_global_sort_index_3d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( + M, bin_size_x, bin_size_y, bin_size_z, numbins[0], numbins[1], numbins[2], + d_binstartpts, d_sortidx, d_kx, d_ky, d_kz, d_idxnupts, nf1, nf2, nf3); RETURN_IF_CUDA_ERROR + } else { + int *d_idxnupts = d_plan->idxnupts; - n = numobins[0] * numobins[1] * numobins[2]; - d_ptr = thrust::device_pointer_cast(d_numsubprob); - d_result = thrust::device_pointer_cast(d_subprobstartpts + 1); - thrust::inclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); - - if ((ier = checkCudaErrors(cudaMemsetAsync(d_subprobstartpts, 0, sizeof(int), stream)))) - return ier; - - int totalnumsubprob; - if ((ier = checkCudaErrors( - cudaMemcpyAsync(&totalnumsubprob, &d_subprobstartpts[n], sizeof(int), cudaMemcpyDeviceToHost, stream)))) - return ier; - cudaStreamSynchronize(stream); - if ((ier = checkCudaErrors(cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream)))) - return ier; - map_b_into_subprob_3d_v1<<<(n + 1024 - 1) / 1024, 1024, 0, stream>>>(d_subprob_to_bin, d_subprobstartpts, - d_numsubprob, n); - err = cudaGetLastError(); - if (err != cudaSuccess) { - fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err)); - cudaFree(d_subprob_to_bin); - return FINUFFT_ERR_CUDA_FAILURE; - } - - assert(d_subprob_to_bin != NULL); - cudaFree(d_plan->subprob_to_bin); - d_plan->subprob_to_bin = d_subprob_to_bin; - d_plan->totalnumsubprob = totalnumsubprob; + trivial_global_sort_index_3d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(M, + d_idxnupts); + RETURN_IF_CUDA_ERROR + } - return 0; + return 0; } -template -int cuspread3d_blockgather(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize) { - auto &stream = d_plan->stream; - - int ns = d_plan->spopts.nspread; - T es_c = d_plan->spopts.ES_c; - T es_beta = d_plan->spopts.ES_beta; - T sigma = d_plan->spopts.upsampfac; - int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; - - int obin_size_x = d_plan->opts.gpu_obinsizex; - int obin_size_y = d_plan->opts.gpu_obinsizey; - int obin_size_z = d_plan->opts.gpu_obinsizez; - int bin_size_x = d_plan->opts.gpu_binsizex; - int bin_size_y = d_plan->opts.gpu_binsizey; - int bin_size_z = d_plan->opts.gpu_binsizez; - int numobins[3]; - numobins[0] = ceil((T)nf1 / obin_size_x); - numobins[1] = ceil((T)nf2 / obin_size_y); - numobins[2] = ceil((T)nf3 / obin_size_z); +template +int cuspread3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, + int blksize) { + auto &stream = d_plan->stream; - int binsperobinx, binsperobiny, binsperobinz; - binsperobinx = obin_size_x / bin_size_x + 2; - binsperobiny = obin_size_y / bin_size_y + 2; - binsperobinz = obin_size_z / bin_size_z + 2; + dim3 threadsPerBlock; + dim3 blocks; - T *d_kx = d_plan->kx; - T *d_ky = d_plan->ky; - T *d_kz = d_plan->kz; - cuda_complex *d_c = d_plan->c; - cuda_complex *d_fw = d_plan->fw; + int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells + T sigma = d_plan->spopts.upsampfac; + T es_c = 
d_plan->spopts.ES_c; + T es_beta = d_plan->spopts.ES_beta; - int *d_binstartpts = d_plan->binstartpts; - int *d_subprobstartpts = d_plan->subprobstartpts; - int *d_idxnupts = d_plan->idxnupts; + int *d_idxnupts = d_plan->idxnupts; + T *d_kx = d_plan->kx; + T *d_ky = d_plan->ky; + T *d_kz = d_plan->kz; + cuda_complex *d_c = d_plan->c; + cuda_complex *d_fw = d_plan->fw; - int totalnumsubprob = d_plan->totalnumsubprob; - int *d_subprob_to_bin = d_plan->subprob_to_bin; + threadsPerBlock.x = 16; + threadsPerBlock.y = 1; + blocks.x = (M + threadsPerBlock.x - 1) / threadsPerBlock.x; + blocks.y = 1; - size_t sharedplanorysize = obin_size_x * obin_size_y * obin_size_z * sizeof(cuda_complex); - if (sharedplanorysize > 49152) { - std::cerr << "[cuspread3d_blockgather] error: not enough shared memory" << std::endl; - return FINUFFT_ERR_INSUFFICIENT_SHMEM; + if (d_plan->opts.gpu_kerevalmeth == 1) { + for (int t = 0; t < blksize; t++) { + spread_3d_nupts_driven<<>>( + d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, + es_c, es_beta, sigma, d_idxnupts); + RETURN_IF_CUDA_ERROR } - + } else { for (int t = 0; t < blksize; t++) { - if (d_plan->opts.gpu_kerevalmeth == 1) { - spread_3d_block_gather<<>>( - d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, es_c, es_beta, sigma, - d_binstartpts, obin_size_x, obin_size_y, obin_size_z, binsperobinx * binsperobiny * binsperobinz, - d_subprob_to_bin, d_subprobstartpts, maxsubprobsize, numobins[0], numobins[1], numobins[2], d_idxnupts); - RETURN_IF_CUDA_ERROR - } else { - spread_3d_block_gather<<>>( - d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, es_c, es_beta, sigma, - d_binstartpts, obin_size_x, obin_size_y, obin_size_z, binsperobinx * binsperobiny * binsperobinz, - d_subprob_to_bin, d_subprobstartpts, maxsubprobsize, numobins[0], numobins[1], numobins[2], d_idxnupts); - RETURN_IF_CUDA_ERROR - } + spread_3d_nupts_driven<<>>( + d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, + es_c, es_beta, sigma, d_idxnupts); + RETURN_IF_CUDA_ERROR } + } - return 0; + return 0; } -template -int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan) { - auto &stream = d_plan->stream; +template +int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M, + cufinufft_plan_t *d_plan) { + auto &stream = d_plan->stream; + + dim3 threadsPerBlock; + dim3 blocks; + + int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; + int o_bin_size_x = d_plan->opts.gpu_obinsizex; + int o_bin_size_y = d_plan->opts.gpu_obinsizey; + int o_bin_size_z = d_plan->opts.gpu_obinsizez; + + int numobins[3]; + if (nf1 % o_bin_size_x != 0 || nf2 % o_bin_size_y != 0 || nf3 % o_bin_size_z != 0) { + std::cerr << "[cuspread3d_blockgather_prop] error:\n"; + std::cerr << " mod(nf(1|2|3), opts.gpu_obinsize(x|y|z)) != 0" << std::endl; + std::cerr << " (nf1, nf2, nf3) = (" << nf1 << ", " << nf2 << ", " << nf3 << ")" + << std::endl; + std::cerr << " (obinsizex, obinsizey, obinsizez) = (" << o_bin_size_x << ", " + << o_bin_size_y << ", " << o_bin_size_z << ")" << std::endl; + return FINUFFT_ERR_BINSIZE_NOTVALID; + } + + numobins[0] = ceil((T)nf1 / o_bin_size_x); + numobins[1] = ceil((T)nf2 / o_bin_size_y); + numobins[2] = ceil((T)nf3 / o_bin_size_z); + + int bin_size_x = d_plan->opts.gpu_binsizex; + int bin_size_y = d_plan->opts.gpu_binsizey; + int bin_size_z = d_plan->opts.gpu_binsizez; + if (o_bin_size_x % bin_size_x != 0 || o_bin_size_y % bin_size_y != 0 || + 
o_bin_size_z % bin_size_z != 0) { + std::cerr << "[cuspread3d_blockgather_prop] error:\n"; + std::cerr << " mod(ops.gpu_obinsize(x|y|z), opts.gpu_binsize(x|y|z)) != 0" + << std::endl; + std::cerr << " (binsizex, binsizey, binsizez) = (" << bin_size_x << ", " + << bin_size_y << ", " << bin_size_z << ")" << std::endl; + std::cerr << " (obinsizex, obinsizey, obinsizez) = (" << o_bin_size_x << ", " + << o_bin_size_y << ", " << o_bin_size_z << ")" << std::endl; + return FINUFFT_ERR_BINSIZE_NOTVALID; + } + + int binsperobinx, binsperobiny, binsperobinz; + int numbins[3]; + binsperobinx = o_bin_size_x / bin_size_x + 2; + binsperobiny = o_bin_size_y / bin_size_y + 2; + binsperobinz = o_bin_size_z / bin_size_z + 2; + numbins[0] = numobins[0] * (binsperobinx); + numbins[1] = numobins[1] * (binsperobiny); + numbins[2] = numobins[2] * (binsperobinz); + + T *d_kx = d_plan->kx; + T *d_ky = d_plan->ky; + T *d_kz = d_plan->kz; + + int *d_binsize = d_plan->binsize; + int *d_sortidx = d_plan->sortidx; + int *d_binstartpts = d_plan->binstartpts; + int *d_numsubprob = d_plan->numsubprob; + int *d_idxnupts = NULL; + int *d_subprobstartpts = d_plan->subprobstartpts; + int *d_subprob_to_bin = NULL; + + int ier; + if ((ier = checkCudaErrors(cudaMemsetAsync( + d_binsize, 0, numbins[0] * numbins[1] * numbins[2] * sizeof(int), stream)))) + return ier; - int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; - int bin_size_x = d_plan->opts.gpu_binsizex; - int bin_size_y = d_plan->opts.gpu_binsizey; - int bin_size_z = d_plan->opts.gpu_binsizez; - if (bin_size_x < 0 || bin_size_y < 0 || bin_size_z < 0) { - std::cerr << "error: invalid binsize (binsizex, binsizey, binsizez) = ("; - std::cerr << bin_size_x << "," << bin_size_y << "," << bin_size_z << ")" << std::endl; - return FINUFFT_ERR_BINSIZE_NOTVALID; - } + locate_nupts_to_bins_ghost<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( + M, bin_size_x, bin_size_y, bin_size_z, numobins[0], numobins[1], numobins[2], + binsperobinx, binsperobiny, binsperobinz, d_binsize, d_kx, d_ky, d_kz, d_sortidx, + nf1, nf2, nf3); + RETURN_IF_CUDA_ERROR - int numbins[3]; - numbins[0] = ceil((T)nf1 / bin_size_x); - numbins[1] = ceil((T)nf2 / bin_size_y); - numbins[2] = ceil((T)nf3 / bin_size_z); + threadsPerBlock.x = 8; + threadsPerBlock.y = 8; + threadsPerBlock.z = 8; - T *d_kx = d_plan->kx; - T *d_ky = d_plan->ky; - T *d_kz = d_plan->kz; + blocks.x = (threadsPerBlock.x + numbins[0] - 1) / threadsPerBlock.x; + blocks.y = (threadsPerBlock.y + numbins[1] - 1) / threadsPerBlock.y; + blocks.z = (threadsPerBlock.z + numbins[2] - 1) / threadsPerBlock.z; - int *d_binsize = d_plan->binsize; - int *d_binstartpts = d_plan->binstartpts; - int *d_sortidx = d_plan->sortidx; - int *d_numsubprob = d_plan->numsubprob; - int *d_subprobstartpts = d_plan->subprobstartpts; - int *d_idxnupts = d_plan->idxnupts; + fill_ghost_bins<<>>( + binsperobinx, binsperobiny, binsperobinz, numobins[0], numobins[1], numobins[2], + d_binsize); + RETURN_IF_CUDA_ERROR - int *d_subprob_to_bin = NULL; + int n = numbins[0] * numbins[1] * numbins[2]; + thrust::device_ptr d_ptr(d_binsize); + thrust::device_ptr d_result(d_binstartpts + 1); + thrust::inclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); - int ier; - if ((ier = checkCudaErrors( - cudaMemsetAsync(d_binsize, 0, numbins[0] * numbins[1] * numbins[2] * sizeof(int), stream)))) - return ier; - calc_bin_size_noghost_3d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( - M, nf1, nf2, nf3, bin_size_x, bin_size_y, bin_size_z, numbins[0], numbins[1], numbins[2], 
d_binsize, d_kx, d_ky, - d_kz, d_sortidx); - RETURN_IF_CUDA_ERROR - - int n = numbins[0] * numbins[1] * numbins[2]; - thrust::device_ptr d_ptr(d_binsize); - thrust::device_ptr d_result(d_binstartpts); - thrust::exclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); - - calc_inverse_of_global_sort_index_3d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( - M, bin_size_x, bin_size_y, bin_size_z, numbins[0], numbins[1], numbins[2], d_binstartpts, d_sortidx, d_kx, d_ky, - d_kz, d_idxnupts, nf1, nf2, nf3); - RETURN_IF_CUDA_ERROR - /* --------------------------------------------- */ - // Determining Subproblem properties // - /* --------------------------------------------- */ - calc_subprob_3d_v2<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(d_binsize, d_numsubprob, maxsubprobsize, - numbins[0] * numbins[1] * numbins[2]); - RETURN_IF_CUDA_ERROR + if ((ier = checkCudaErrors(cudaMemsetAsync(d_binstartpts, 0, sizeof(int), stream)))) + return ier; - d_ptr = thrust::device_pointer_cast(d_numsubprob); - d_result = thrust::device_pointer_cast(d_subprobstartpts + 1); - thrust::inclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); - int totalnumsubprob; - if (checkCudaErrors(cudaMemsetAsync(d_subprobstartpts, 0, sizeof(int), stream)) || - checkCudaErrors( - cudaMemcpyAsync(&totalnumsubprob, &d_subprobstartpts[n], sizeof(int), cudaMemcpyDeviceToHost, stream) - ) - ) - return FINUFFT_ERR_CUDA_FAILURE; - cudaStreamSynchronize(stream); - if(checkCudaErrors(cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream))) - return FINUFFT_ERR_CUDA_FAILURE; - - map_b_into_subprob_3d_v2<<<(numbins[0] * numbins[1] + 1024 - 1) / 1024, 1024, 0, stream>>>( - d_subprob_to_bin, d_subprobstartpts, d_numsubprob, numbins[0] * numbins[1] * numbins[2]); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err)); - cudaFree(d_subprob_to_bin); - return FINUFFT_ERR_CUDA_FAILURE; - } + int totalNUpts; + if ((ier = checkCudaErrors(cudaMemcpyAsync(&totalNUpts, &d_binstartpts[n], sizeof(int), + cudaMemcpyDeviceToHost, stream)))) + return ier; + cudaStreamSynchronize(stream); + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_idxnupts, totalNUpts * sizeof(int), stream)))) + return ier; - assert(d_subprob_to_bin != NULL); - if (d_plan->subprob_to_bin != NULL) - cudaFree(d_plan->subprob_to_bin); - d_plan->subprob_to_bin = d_subprob_to_bin; - assert(d_plan->subprob_to_bin != nullptr); - d_plan->totalnumsubprob = totalnumsubprob; + calc_inverse_of_global_sort_index_ghost<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( + M, bin_size_x, bin_size_y, bin_size_z, numobins[0], numobins[1], numobins[2], + binsperobinx, binsperobiny, binsperobinz, d_binstartpts, d_sortidx, d_kx, d_ky, + d_kz, d_idxnupts, nf1, nf2, nf3); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err)); + cudaFree(d_idxnupts); + return FINUFFT_ERR_CUDA_FAILURE; + } + + threadsPerBlock.x = 2; + threadsPerBlock.y = 2; + threadsPerBlock.z = 2; + + blocks.x = (threadsPerBlock.x + numbins[0] - 1) / threadsPerBlock.x; + blocks.y = (threadsPerBlock.y + numbins[1] - 1) / threadsPerBlock.y; + blocks.z = (threadsPerBlock.z + numbins[2] - 1) / threadsPerBlock.z; + + ghost_bin_pts_index<<>>( + binsperobinx, binsperobiny, binsperobinz, numobins[0], numobins[1], numobins[2], + d_binsize, d_idxnupts, d_binstartpts, M); + err = cudaGetLastError(); + if (err != cudaSuccess) { + 
fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err)); + cudaFree(d_idxnupts); + return FINUFFT_ERR_CUDA_FAILURE; + } + + cudaFree(d_plan->idxnupts); + d_plan->idxnupts = d_idxnupts; + + /* --------------------------------------------- */ + // Determining Subproblem properties // + /* --------------------------------------------- */ + n = numobins[0] * numobins[1] * numobins[2]; + calc_subprob_3d_v1<<<(n + 1024 - 1) / 1024, 1024, 0, stream>>>( + binsperobinx, binsperobiny, binsperobinz, d_binsize, d_numsubprob, maxsubprobsize, + numobins[0] * numobins[1] * numobins[2]); + RETURN_IF_CUDA_ERROR + + n = numobins[0] * numobins[1] * numobins[2]; + d_ptr = thrust::device_pointer_cast(d_numsubprob); + d_result = thrust::device_pointer_cast(d_subprobstartpts + 1); + thrust::inclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); + + if ((ier = checkCudaErrors(cudaMemsetAsync(d_subprobstartpts, 0, sizeof(int), stream)))) + return ier; - return 0; + int totalnumsubprob; + if ((ier = + checkCudaErrors(cudaMemcpyAsync(&totalnumsubprob, &d_subprobstartpts[n], + sizeof(int), cudaMemcpyDeviceToHost, stream)))) + return ier; + cudaStreamSynchronize(stream); + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream)))) + return ier; + map_b_into_subprob_3d_v1<<<(n + 1024 - 1) / 1024, 1024, 0, stream>>>( + d_subprob_to_bin, d_subprobstartpts, d_numsubprob, n); + err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err)); + cudaFree(d_subprob_to_bin); + return FINUFFT_ERR_CUDA_FAILURE; + } + + assert(d_subprob_to_bin != NULL); + cudaFree(d_plan->subprob_to_bin); + d_plan->subprob_to_bin = d_subprob_to_bin; + d_plan->totalnumsubprob = totalnumsubprob; + + return 0; } -template -int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize) { - auto &stream = d_plan->stream; - - int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells - int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; - - // assume that bin_size_x > ns/2; - int bin_size_x = d_plan->opts.gpu_binsizex; - int bin_size_y = d_plan->opts.gpu_binsizey; - int bin_size_z = d_plan->opts.gpu_binsizez; - int numbins[3]; - numbins[0] = ceil((T)nf1 / bin_size_x); - numbins[1] = ceil((T)nf2 / bin_size_y); - numbins[2] = ceil((T)nf3 / bin_size_z); - - T *d_kx = d_plan->kx; - T *d_ky = d_plan->ky; - T *d_kz = d_plan->kz; - cuda_complex *d_c = d_plan->c; - cuda_complex *d_fw = d_plan->fw; +template +int cuspread3d_blockgather(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, + int blksize) { + auto &stream = d_plan->stream; + + int ns = d_plan->spopts.nspread; + T es_c = d_plan->spopts.ES_c; + T es_beta = d_plan->spopts.ES_beta; + T sigma = d_plan->spopts.upsampfac; + int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; + + int obin_size_x = d_plan->opts.gpu_obinsizex; + int obin_size_y = d_plan->opts.gpu_obinsizey; + int obin_size_z = d_plan->opts.gpu_obinsizez; + int bin_size_x = d_plan->opts.gpu_binsizex; + int bin_size_y = d_plan->opts.gpu_binsizey; + int bin_size_z = d_plan->opts.gpu_binsizez; + int numobins[3]; + numobins[0] = ceil((T)nf1 / obin_size_x); + numobins[1] = ceil((T)nf2 / obin_size_y); + numobins[2] = ceil((T)nf3 / obin_size_z); + + int binsperobinx, binsperobiny, binsperobinz; + binsperobinx = obin_size_x / bin_size_x + 2; + binsperobiny = obin_size_y / bin_size_y + 2; + binsperobinz = obin_size_z / bin_size_z + 2; + + T *d_kx 
= d_plan->kx; + T *d_ky = d_plan->ky; + T *d_kz = d_plan->kz; + cuda_complex *d_c = d_plan->c; + cuda_complex *d_fw = d_plan->fw; + + int *d_binstartpts = d_plan->binstartpts; + int *d_subprobstartpts = d_plan->subprobstartpts; + int *d_idxnupts = d_plan->idxnupts; + + int totalnumsubprob = d_plan->totalnumsubprob; + int *d_subprob_to_bin = d_plan->subprob_to_bin; + + size_t sharedplanorysize = + obin_size_x * obin_size_y * obin_size_z * sizeof(cuda_complex); + if (sharedplanorysize > 49152) { + std::cerr << "[cuspread3d_blockgather] error: not enough shared memory" << std::endl; + return FINUFFT_ERR_INSUFFICIENT_SHMEM; + } + + for (int t = 0; t < blksize; t++) { + if (d_plan->opts.gpu_kerevalmeth == 1) { + spread_3d_block_gather<<>>( + d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, + es_c, es_beta, sigma, d_binstartpts, obin_size_x, obin_size_y, obin_size_z, + binsperobinx * binsperobiny * binsperobinz, d_subprob_to_bin, d_subprobstartpts, + maxsubprobsize, numobins[0], numobins[1], numobins[2], d_idxnupts); + RETURN_IF_CUDA_ERROR + } else { + spread_3d_block_gather<<>>( + d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, + es_c, es_beta, sigma, d_binstartpts, obin_size_x, obin_size_y, obin_size_z, + binsperobinx * binsperobiny * binsperobinz, d_subprob_to_bin, d_subprobstartpts, + maxsubprobsize, numobins[0], numobins[1], numobins[2], d_idxnupts); + RETURN_IF_CUDA_ERROR + } + } - int *d_binsize = d_plan->binsize; - int *d_binstartpts = d_plan->binstartpts; - int *d_numsubprob = d_plan->numsubprob; - int *d_subprobstartpts = d_plan->subprobstartpts; - int *d_idxnupts = d_plan->idxnupts; + return 0; +} - int totalnumsubprob = d_plan->totalnumsubprob; - int *d_subprob_to_bin = d_plan->subprob_to_bin; - - T sigma = d_plan->spopts.upsampfac; - T es_c = d_plan->spopts.ES_c; - T es_beta = d_plan->spopts.ES_beta; - size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)) * - (bin_size_z + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex); - if (sharedplanorysize > 49152) { - std::cerr << "[cuspread3d_subprob] error: not enough shared memory (" << sharedplanorysize << ")" << std::endl; - return FINUFFT_ERR_INSUFFICIENT_SHMEM; - } +template +int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M, + cufinufft_plan_t *d_plan) { + auto &stream = d_plan->stream; + + int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; + int bin_size_x = d_plan->opts.gpu_binsizex; + int bin_size_y = d_plan->opts.gpu_binsizey; + int bin_size_z = d_plan->opts.gpu_binsizez; + if (bin_size_x < 0 || bin_size_y < 0 || bin_size_z < 0) { + std::cerr << "error: invalid binsize (binsizex, binsizey, binsizez) = ("; + std::cerr << bin_size_x << "," << bin_size_y << "," << bin_size_z << ")" << std::endl; + return FINUFFT_ERR_BINSIZE_NOTVALID; + } + + int numbins[3]; + numbins[0] = ceil((T)nf1 / bin_size_x); + numbins[1] = ceil((T)nf2 / bin_size_y); + numbins[2] = ceil((T)nf3 / bin_size_z); + + T *d_kx = d_plan->kx; + T *d_ky = d_plan->ky; + T *d_kz = d_plan->kz; + + int *d_binsize = d_plan->binsize; + int *d_binstartpts = d_plan->binstartpts; + int *d_sortidx = d_plan->sortidx; + int *d_numsubprob = d_plan->numsubprob; + int *d_subprobstartpts = d_plan->subprobstartpts; + int *d_idxnupts = d_plan->idxnupts; + + int *d_subprob_to_bin = NULL; + + int ier; + if ((ier = checkCudaErrors(cudaMemsetAsync( + d_binsize, 0, numbins[0] * numbins[1] * numbins[2] * sizeof(int), stream)))) + return ier; + calc_bin_size_noghost_3d<<<(M + 1024 
- 1) / 1024, 1024, 0, stream>>>( + M, nf1, nf2, nf3, bin_size_x, bin_size_y, bin_size_z, numbins[0], numbins[1], + numbins[2], d_binsize, d_kx, d_ky, d_kz, d_sortidx); + RETURN_IF_CUDA_ERROR + + int n = numbins[0] * numbins[1] * numbins[2]; + thrust::device_ptr d_ptr(d_binsize); + thrust::device_ptr d_result(d_binstartpts); + thrust::exclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); + + calc_inverse_of_global_sort_index_3d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( + M, bin_size_x, bin_size_y, bin_size_z, numbins[0], numbins[1], numbins[2], + d_binstartpts, d_sortidx, d_kx, d_ky, d_kz, d_idxnupts, nf1, nf2, nf3); + RETURN_IF_CUDA_ERROR + /* --------------------------------------------- */ + // Determining Subproblem properties // + /* --------------------------------------------- */ + calc_subprob_3d_v2<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( + d_binsize, d_numsubprob, maxsubprobsize, numbins[0] * numbins[1] * numbins[2]); + RETURN_IF_CUDA_ERROR + + d_ptr = thrust::device_pointer_cast(d_numsubprob); + d_result = thrust::device_pointer_cast(d_subprobstartpts + 1); + thrust::inclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); + int totalnumsubprob; + if (checkCudaErrors(cudaMemsetAsync(d_subprobstartpts, 0, sizeof(int), stream)) || + checkCudaErrors(cudaMemcpyAsync(&totalnumsubprob, &d_subprobstartpts[n], + sizeof(int), cudaMemcpyDeviceToHost, stream))) + return FINUFFT_ERR_CUDA_FAILURE; + cudaStreamSynchronize(stream); + if (checkCudaErrors( + cudaMallocAsync(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream))) + return FINUFFT_ERR_CUDA_FAILURE; + + map_b_into_subprob_3d_v2<<<(numbins[0] * numbins[1] + 1024 - 1) / 1024, 1024, 0, + stream>>>(d_subprob_to_bin, d_subprobstartpts, d_numsubprob, + numbins[0] * numbins[1] * numbins[2]); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err)); + cudaFree(d_subprob_to_bin); + return FINUFFT_ERR_CUDA_FAILURE; + } + + assert(d_subprob_to_bin != NULL); + if (d_plan->subprob_to_bin != NULL) cudaFree(d_plan->subprob_to_bin); + d_plan->subprob_to_bin = d_subprob_to_bin; + assert(d_plan->subprob_to_bin != nullptr); + d_plan->totalnumsubprob = totalnumsubprob; + + return 0; +} - for (int t = 0; t < blksize; t++) { - if (d_plan->opts.gpu_kerevalmeth) { - spread_3d_subprob<<>>( - d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, sigma, es_c, es_beta, - d_binstartpts, d_binsize, bin_size_x, bin_size_y, bin_size_z, d_subprob_to_bin, d_subprobstartpts, - d_numsubprob, maxsubprobsize, numbins[0], numbins[1], numbins[2], d_idxnupts); - RETURN_IF_CUDA_ERROR - } else { - spread_3d_subprob<<>>( - d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, sigma, es_c, es_beta, - d_binstartpts, d_binsize, bin_size_x, bin_size_y, bin_size_z, d_subprob_to_bin, d_subprobstartpts, - d_numsubprob, maxsubprobsize, numbins[0], numbins[1], numbins[2], d_idxnupts); - RETURN_IF_CUDA_ERROR - } +template +int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, + int blksize) { + auto &stream = d_plan->stream; + + int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells + int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; + + // assume that bin_size_x > ns/2; + int bin_size_x = d_plan->opts.gpu_binsizex; + int bin_size_y = d_plan->opts.gpu_binsizey; + int bin_size_z = d_plan->opts.gpu_binsizez; + int numbins[3]; + numbins[0] = 
ceil((T)nf1 / bin_size_x); + numbins[1] = ceil((T)nf2 / bin_size_y); + numbins[2] = ceil((T)nf3 / bin_size_z); + + T *d_kx = d_plan->kx; + T *d_ky = d_plan->ky; + T *d_kz = d_plan->kz; + cuda_complex *d_c = d_plan->c; + cuda_complex *d_fw = d_plan->fw; + + int *d_binsize = d_plan->binsize; + int *d_binstartpts = d_plan->binstartpts; + int *d_numsubprob = d_plan->numsubprob; + int *d_subprobstartpts = d_plan->subprobstartpts; + int *d_idxnupts = d_plan->idxnupts; + + int totalnumsubprob = d_plan->totalnumsubprob; + int *d_subprob_to_bin = d_plan->subprob_to_bin; + + T sigma = d_plan->spopts.upsampfac; + T es_c = d_plan->spopts.ES_c; + T es_beta = d_plan->spopts.ES_beta; + size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) * + (bin_size_y + 2 * ceil(ns / 2.0)) * + (bin_size_z + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex); + if (sharedplanorysize > 49152) { + std::cerr << "[cuspread3d_subprob] error: not enough shared memory (" + << sharedplanorysize << ")" << std::endl; + return FINUFFT_ERR_INSUFFICIENT_SHMEM; + } + + for (int t = 0; t < blksize; t++) { + if (d_plan->opts.gpu_kerevalmeth) { + spread_3d_subprob<<>>( + d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, + sigma, es_c, es_beta, d_binstartpts, d_binsize, bin_size_x, bin_size_y, + bin_size_z, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, + numbins[0], numbins[1], numbins[2], d_idxnupts); + RETURN_IF_CUDA_ERROR + } else { + spread_3d_subprob<<>>( + d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, + sigma, es_c, es_beta, d_binstartpts, d_binsize, bin_size_x, bin_size_y, + bin_size_z, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, + numbins[0], numbins[1], numbins[2], d_idxnupts); + RETURN_IF_CUDA_ERROR } + } - return 0; + return 0; } template int cuspread3d(cufinufft_plan_t *d_plan, int blksize); template int cuspread3d(cufinufft_plan_t *d_plan, int blksize); -template int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan); -template int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan); -template int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan); -template int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan); -template int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan); -template int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan); +template int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M, + cufinufft_plan_t *d_plan); +template int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M, + cufinufft_plan_t *d_plan); +template int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M, + cufinufft_plan_t *d_plan); +template int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M, + cufinufft_plan_t *d_plan); +template int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M, + cufinufft_plan_t *d_plan); +template int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M, + cufinufft_plan_t *d_plan); } // namespace spreadinterp } // namespace cufinufft diff --git a/src/cuda/common.cu b/src/cuda/common.cu index a83688693..c6bf8315d 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -25,39 +25,42 @@ using std::max; cnufftspread's real symmetric kernel. 
*/ // a , f are intermediate results from function onedim_fseries_kernel_precomp() // (see cufinufft/contrib/common.cpp for description) -template -__global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, cuDoubleComplex *a, T *fwkerhalf1, - T *fwkerhalf2, T *fwkerhalf3, int ns) { - T J2 = ns / 2.0; - int q = (int)(2 + 3.0 * J2); - int nf; - cuDoubleComplex *at = a + threadIdx.y * MAX_NQUAD; - T *ft = f + threadIdx.y * MAX_NQUAD; - T *oarr; - if (threadIdx.y == 0) { - oarr = fwkerhalf1; - nf = nf1; - } else if (threadIdx.y == 1) { - oarr = fwkerhalf2; - nf = nf2; - } else { - oarr = fwkerhalf3; - nf = nf3; - } - - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < nf / 2 + 1; i += blockDim.x * gridDim.x) { - int brk = 0.5 + i; - T x = 0.0; - for (int n = 0; n < q; n++) { - x += ft[n] * 2 * (pow(cabs(at[n]), brk) * cos(brk * carg(at[n]))); - } - oarr[i] = x; +template +__global__ void fseries_kernel_compute(int nf1, int nf2, int nf3, T *f, + cuDoubleComplex *a, T *fwkerhalf1, T *fwkerhalf2, + T *fwkerhalf3, int ns) { + T J2 = ns / 2.0; + int q = (int)(2 + 3.0 * J2); + int nf; + cuDoubleComplex *at = a + threadIdx.y * MAX_NQUAD; + T *ft = f + threadIdx.y * MAX_NQUAD; + T *oarr; + if (threadIdx.y == 0) { + oarr = fwkerhalf1; + nf = nf1; + } else if (threadIdx.y == 1) { + oarr = fwkerhalf2; + nf = nf2; + } else { + oarr = fwkerhalf3; + nf = nf3; + } + + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < nf / 2 + 1; + i += blockDim.x * gridDim.x) { + int brk = 0.5 + i; + T x = 0.0; + for (int n = 0; n < q; n++) { + x += ft[n] * 2 * (pow(cabs(at[n]), brk) * cos(brk * carg(at[n]))); } + oarr[i] = x; + } } -template -int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f, cuDoubleComplex *d_a, T *d_fwkerhalf1, - T *d_fwkerhalf2, T *d_fwkerhalf3, int ns, cudaStream_t stream) +template +int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f, + cuDoubleComplex *d_a, T *d_fwkerhalf1, T *d_fwkerhalf2, + T *d_fwkerhalf3, int ns, cudaStream_t stream) /* wrapper for approximation of Fourier series of real symmetric spreading kernel. @@ -65,44 +68,43 @@ int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f, cuDoubleC Melody Shih 2/20/22 */ { - int nout = max(max(nf1 / 2 + 1, nf2 / 2 + 1), nf3 / 2 + 1); + int nout = max(max(nf1 / 2 + 1, nf2 / 2 + 1), nf3 / 2 + 1); - dim3 threadsPerBlock(16, dim); - dim3 numBlocks((nout + 16 - 1) / 16, 1); + dim3 threadsPerBlock(16, dim); + dim3 numBlocks((nout + 16 - 1) / 16, 1); - fseries_kernel_compute<<>>(nf1, nf2, nf3, d_f, d_a, d_fwkerhalf1, - d_fwkerhalf2, d_fwkerhalf3, ns); - RETURN_IF_CUDA_ERROR + fseries_kernel_compute<<>>( + nf1, nf2, nf3, d_f, d_a, d_fwkerhalf1, d_fwkerhalf2, d_fwkerhalf3, ns); + RETURN_IF_CUDA_ERROR - return 0; + return 0; } -template +template int setup_spreader_for_nufft(finufft_spread_opts &spopts, T eps, cufinufft_opts opts) // Set up the spreader parameters given eps, and pass across various nufft // options. Report status of setup_spreader. 
Barnett 10/30/17 { - int ier = setup_spreader(spopts, eps, (T)opts.upsampfac, opts.gpu_kerevalmeth); - return ier; + int ier = setup_spreader(spopts, eps, (T)opts.upsampfac, opts.gpu_kerevalmeth); + return ier; } -void set_nf_type12(CUFINUFFT_BIGINT ms, cufinufft_opts opts, finufft_spread_opts spopts, CUFINUFFT_BIGINT *nf, - CUFINUFFT_BIGINT bs) +void set_nf_type12(CUFINUFFT_BIGINT ms, cufinufft_opts opts, finufft_spread_opts spopts, + CUFINUFFT_BIGINT *nf, CUFINUFFT_BIGINT bs) // type 1 & 2 recipe for how to set 1d size of upsampled array, nf, given opts // and requested number of Fourier modes ms. { - *nf = (CUFINUFFT_BIGINT)(opts.upsampfac * ms); - if (*nf < 2 * spopts.nspread) - *nf = 2 * spopts.nspread; // otherwise spread fails - if (*nf < MAX_NF) { // otherwise will fail anyway - if (opts.gpu_method == 4) // expensive at huge nf - *nf = utils::next235beven(*nf, bs); - else - *nf = utils::next235beven(*nf, 1); - } + *nf = (CUFINUFFT_BIGINT)(opts.upsampfac * ms); + if (*nf < 2 * spopts.nspread) *nf = 2 * spopts.nspread; // otherwise spread fails + if (*nf < MAX_NF) { // otherwise will fail anyway + if (opts.gpu_method == 4) // expensive at huge nf + *nf = utils::next235beven(*nf, bs); + else + *nf = utils::next235beven(*nf, 1); + } } -template +template void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, T *fwkerhalf, finufft_spread_opts opts) /* Approximates exact Fourier series coeffs of cnufftspread's real symmetric @@ -129,10 +131,10 @@ void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, T *fwkerhalf, finufft_spread_opt Melody 2/20/22 separate into precomp & comp functions defined below. */ { - T f[MAX_NQUAD]; - std::complex a[MAX_NQUAD]; - onedim_fseries_kernel_precomp(nf, f, a, opts); - onedim_fseries_kernel_compute(nf, f, a, fwkerhalf, opts); + T f[MAX_NQUAD]; + std::complex a[MAX_NQUAD]; + onedim_fseries_kernel_precomp(nf, f, a, opts); + onedim_fseries_kernel_compute(nf, f, a, fwkerhalf, opts); } /* @@ -148,70 +150,82 @@ void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, T *fwkerhalf, finufft_spread_opt f - funciton values at quadrature nodes multiplied with quadrature weights (a, f are provided as the inputs of onedim_fseries_kernel_compute() defined below) */ -template -void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, std::complex *a, finufft_spread_opts opts) { - T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support - // # quadr nodes in z (from 0 to J/2; reflections will be added)... - int q = (int)(2 + 3.0 * J2); // not sure why so large? cannot exceed MAX_NQUAD - double z[2 * MAX_NQUAD]; - double w[2 * MAX_NQUAD]; - - finufft::quadrature::legendre_compute_glr(2 * q, z, w); // only half the nodes used, eg on (0,1) - for (int n = 0; n < q; ++n) { // set up nodes z_n and vals f_n - z[n] *= J2; // rescale nodes - f[n] = J2 * w[n] * evaluate_kernel((T)z[n], opts); // vals & quadr wei - a[n] = exp((T)(2.0 * M_PI) * std::complex(0.0, 1.0) * (T)(nf / 2 - z[n]) / (T)nf); // phase winding rates - } +template +void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, T *f, std::complex *a, + finufft_spread_opts opts) { + T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support + // # quadr nodes in z (from 0 to J/2; reflections will be added)... + int q = (int)(2 + 3.0 * J2); // not sure why so large? 
cannot exceed MAX_NQUAD + double z[2 * MAX_NQUAD]; + double w[2 * MAX_NQUAD]; + + finufft::quadrature::legendre_compute_glr(2 * q, z, w); // only half the nodes used, + // eg on (0,1) + for (int n = 0; n < q; ++n) { // set up nodes z_n and vals f_n + z[n] *= J2; // rescale nodes + f[n] = J2 * w[n] * evaluate_kernel((T)z[n], opts); // vals & quadr wei + a[n] = exp((T)(2.0 * M_PI) * std::complex(0.0, 1.0) * (T)(nf / 2 - z[n]) / + (T)nf); // phase winding rates + } } -template -void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex *a, T *fwkerhalf, - finufft_spread_opts opts) { - T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support - int q = (int)(2 + 3.0 * J2); // not sure why so large? cannot exceed MAX_NQUAD - CUFINUFFT_BIGINT nout = nf / 2 + 1; // how many values we're writing to - int nt = std::min(nout, MY_OMP_GET_MAX_THREADS()); // how many chunks - std::vector brk(nt + 1); // start indices for each thread - for (int t = 0; t <= nt; ++t) // split nout mode indices btw threads - brk[t] = (CUFINUFFT_BIGINT)(0.5 + nout * t / (double)nt); +template +void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex *a, + T *fwkerhalf, finufft_spread_opts opts) { + T J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support + int q = (int)(2 + 3.0 * J2); // not sure why so large? cannot exceed MAX_NQUAD + CUFINUFFT_BIGINT nout = nf / 2 + 1; // how many values we're writing to + int nt = std::min(nout, MY_OMP_GET_MAX_THREADS()); // how many chunks + std::vector brk(nt + 1); // start indices for each thread + for (int t = 0; t <= nt; ++t) // split nout mode indices btw threads + brk[t] = (CUFINUFFT_BIGINT)(0.5 + nout * t / (double)nt); #pragma omp parallel - { - int t = MY_OMP_GET_THREAD_NUM(); - if (t < nt) { // could be nt < actual # threads - std::complex aj[MAX_NQUAD]; // phase rotator for this thread - for (int n = 0; n < q; ++n) - aj[n] = pow(a[n], (T)brk[t]); // init phase factors for chunk - for (CUFINUFFT_BIGINT j = brk[t]; j < brk[t + 1]; ++j) { // loop along output array - T x = 0.0; // accumulator for answer at this j - for (int n = 0; n < q; ++n) { - x += f[n] * 2 * real(aj[n]); // include the negative freq - aj[n] *= a[n]; // wind the phases - } - fwkerhalf[j] = x; - } + { + int t = MY_OMP_GET_THREAD_NUM(); + if (t < nt) { // could be nt < actual # threads + std::complex aj[MAX_NQUAD]; // phase rotator for this thread + for (int n = 0; n < q; ++n) + aj[n] = pow(a[n], (T)brk[t]); // init phase factors for chunk + for (CUFINUFFT_BIGINT j = brk[t]; j < brk[t + 1]; ++j) { // loop along output + // array + T x = 0.0; // accumulator for answer at this j + for (int n = 0; n < q; ++n) { + x += f[n] * 2 * real(aj[n]); // include the negative freq + aj[n] *= a[n]; // wind the phases } + fwkerhalf[j] = x; + } } + } } -template void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, float *f, std::complex *a, float *fwkerhalf, +template void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, float *f, + std::complex *a, float *fwkerhalf, finufft_spread_opts opts); -template void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, double *f, std::complex *a, double *fwkerhalf, +template void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, double *f, + std::complex *a, double *fwkerhalf, finufft_spread_opts opts); -template int setup_spreader_for_nufft(finufft_spread_opts &spopts, float eps, cufinufft_opts opts); -template int setup_spreader_for_nufft(finufft_spread_opts &spopts, double eps, cufinufft_opts opts); -template void 
onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, float *f, std::complex *a, - finufft_spread_opts opts); -template void onedim_fseries_kernel_precomp(CUFINUFFT_BIGINT nf, double *f, std::complex *a, - finufft_spread_opts opts); -template int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, float *d_f, cuDoubleComplex *d_a, - float *d_fwkerhalf1, float *d_fwkerhalf2, float *d_fwkerhalf3, int ns, +template int setup_spreader_for_nufft(finufft_spread_opts &spopts, float eps, + cufinufft_opts opts); +template int setup_spreader_for_nufft(finufft_spread_opts &spopts, double eps, + cufinufft_opts opts); +template void onedim_fseries_kernel_precomp( + CUFINUFFT_BIGINT nf, float *f, std::complex *a, finufft_spread_opts opts); +template void onedim_fseries_kernel_precomp( + CUFINUFFT_BIGINT nf, double *f, std::complex *a, finufft_spread_opts opts); +template int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, float *d_f, + cuDoubleComplex *d_a, float *d_fwkerhalf1, + float *d_fwkerhalf2, float *d_fwkerhalf3, int ns, cudaStream_t stream); -template int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, double *d_f, cuDoubleComplex *d_a, - double *d_fwkerhalf1, double *d_fwkerhalf2, double *d_fwkerhalf3, int ns, +template int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, double *d_f, + cuDoubleComplex *d_a, double *d_fwkerhalf1, + double *d_fwkerhalf2, double *d_fwkerhalf3, int ns, cudaStream_t stream); -template void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, float *fwkerhalf, finufft_spread_opts opts); -template void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, double *fwkerhalf, finufft_spread_opts opts); +template void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, float *fwkerhalf, + finufft_spread_opts opts); +template void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, double *fwkerhalf, + finufft_spread_opts opts); } // namespace common } // namespace cufinufft diff --git a/src/cuda/cufinufft.cu b/src/cuda/cufinufft.cu index 091158299..40510e95b 100644 --- a/src/cuda/cufinufft.cu +++ b/src/cuda/cufinufft.cu @@ -7,76 +7,75 @@ #include inline bool is_invalid_mode_array(int dim, const int64_t *modes64, int32_t modes32[3]) { - int64_t tot_size = 1; - for (int i = 0; i < dim; ++i) { - if (modes64[i] > std::numeric_limits::max()) - return true; - if (modes64[i] <= 0) - return true; - modes32[i] = modes64[i]; - tot_size *= modes64[i]; - } - for (int i = dim; i < 3; ++i) - modes32[i] = 1; - - return tot_size > std::numeric_limits::max(); + int64_t tot_size = 1; + for (int i = 0; i < dim; ++i) { + if (modes64[i] > std::numeric_limits::max()) return true; + if (modes64[i] <= 0) return true; + modes32[i] = modes64[i]; + tot_size *= modes64[i]; + } + for (int i = dim; i < 3; ++i) modes32[i] = 1; + + return tot_size > std::numeric_limits::max(); } extern "C" { -int cufinufftf_makeplan(int type, int dim, const int64_t *nmodes, int iflag, int ntransf, float tol, - cufinufftf_plan *d_plan_ptr, cufinufft_opts *opts) { - if (dim < 1 || dim > 3) { - fprintf(stderr, "[%s] Invalid dim (%d), should be 1, 2 or 3.\n", __func__, dim); - return FINUFFT_ERR_DIM_NOTVALID; - } - - int nmodes32[3]; - if (is_invalid_mode_array(dim, nmodes, nmodes32)) - return FINUFFT_ERR_NDATA_NOTVALID; - - return cufinufft_makeplan_impl(type, dim, nmodes32, iflag, ntransf, tol, (cufinufft_plan_t **)d_plan_ptr, - opts); +int cufinufftf_makeplan(int type, int dim, const int64_t *nmodes, int iflag, int ntransf, + float tol, cufinufftf_plan *d_plan_ptr, cufinufft_opts *opts) { + if (dim < 1 || dim > 3) { + 
fprintf(stderr, "[%s] Invalid dim (%d), should be 1, 2 or 3.\n", __func__, dim); + return FINUFFT_ERR_DIM_NOTVALID; + } + + int nmodes32[3]; + if (is_invalid_mode_array(dim, nmodes, nmodes32)) return FINUFFT_ERR_NDATA_NOTVALID; + + return cufinufft_makeplan_impl(type, dim, nmodes32, iflag, ntransf, tol, + (cufinufft_plan_t **)d_plan_ptr, opts); } -int cufinufft_makeplan(int type, int dim, const int64_t *nmodes, int iflag, int ntransf, double tol, - cufinufft_plan *d_plan_ptr, cufinufft_opts *opts) { - if (dim < 1 || dim > 3) { - fprintf(stderr, "[%s] Invalid dim (%d), should be 1, 2 or 3.\n", __func__, dim); - return FINUFFT_ERR_DIM_NOTVALID; - } +int cufinufft_makeplan(int type, int dim, const int64_t *nmodes, int iflag, int ntransf, + double tol, cufinufft_plan *d_plan_ptr, cufinufft_opts *opts) { + if (dim < 1 || dim > 3) { + fprintf(stderr, "[%s] Invalid dim (%d), should be 1, 2 or 3.\n", __func__, dim); + return FINUFFT_ERR_DIM_NOTVALID; + } - int nmodes32[3]; - if (is_invalid_mode_array(dim, nmodes, nmodes32)) - return FINUFFT_ERR_NDATA_NOTVALID; + int nmodes32[3]; + if (is_invalid_mode_array(dim, nmodes, nmodes32)) return FINUFFT_ERR_NDATA_NOTVALID; - return cufinufft_makeplan_impl(type, dim, nmodes32, iflag, ntransf, tol, (cufinufft_plan_t **)d_plan_ptr, - opts); + return cufinufft_makeplan_impl(type, dim, nmodes32, iflag, ntransf, tol, + (cufinufft_plan_t **)d_plan_ptr, opts); } -int cufinufftf_setpts(cufinufftf_plan d_plan, int M, float *d_x, float *d_y, float *d_z, int N, float *d_s, - float *d_t, float *d_u) { - return cufinufft_setpts_impl(M, d_x, d_y, d_z, N, d_s, d_t, d_u, (cufinufft_plan_t *)d_plan); +int cufinufftf_setpts(cufinufftf_plan d_plan, int M, float *d_x, float *d_y, float *d_z, + int N, float *d_s, float *d_t, float *d_u) { + return cufinufft_setpts_impl(M, d_x, d_y, d_z, N, d_s, d_t, d_u, + (cufinufft_plan_t *)d_plan); } -int cufinufft_setpts(cufinufft_plan d_plan, int M, double *d_x, double *d_y, double *d_z, int N, double *d_s, - double *d_t, double *d_u) { - return cufinufft_setpts_impl(M, d_x, d_y, d_z, N, d_s, d_t, d_u, (cufinufft_plan_t *)d_plan); +int cufinufft_setpts(cufinufft_plan d_plan, int M, double *d_x, double *d_y, double *d_z, + int N, double *d_s, double *d_t, double *d_u) { + return cufinufft_setpts_impl(M, d_x, d_y, d_z, N, d_s, d_t, d_u, + (cufinufft_plan_t *)d_plan); } -int cufinufftf_execute(cufinufftf_plan d_plan, cuFloatComplex *d_c, cuFloatComplex *d_fk) { - return cufinufft_execute_impl(d_c, d_fk, (cufinufft_plan_t *)d_plan); +int cufinufftf_execute(cufinufftf_plan d_plan, cuFloatComplex *d_c, + cuFloatComplex *d_fk) { + return cufinufft_execute_impl(d_c, d_fk, (cufinufft_plan_t *)d_plan); } -int cufinufft_execute(cufinufft_plan d_plan, cuDoubleComplex *d_c, cuda_complex *d_fk) { - return cufinufft_execute_impl(d_c, d_fk, (cufinufft_plan_t *)d_plan); +int cufinufft_execute(cufinufft_plan d_plan, cuDoubleComplex *d_c, + cuda_complex *d_fk) { + return cufinufft_execute_impl(d_c, d_fk, (cufinufft_plan_t *)d_plan); } int cufinufftf_destroy(cufinufftf_plan d_plan) { - return cufinufft_destroy_impl((cufinufft_plan_t *)d_plan); + return cufinufft_destroy_impl((cufinufft_plan_t *)d_plan); } int cufinufft_destroy(cufinufft_plan d_plan) { - return cufinufft_destroy_impl((cufinufft_plan_t *)d_plan); + return cufinufft_destroy_impl((cufinufft_plan_t *)d_plan); } void cufinufft_default_opts(cufinufft_opts *opts) @@ -96,32 +95,32 @@ void cufinufft_default_opts(cufinufft_opts *opts) Melody Shih 07/25/19; Barnett 2/5/21. 
*/ { - opts->upsampfac = 2.0; + opts->upsampfac = 2.0; - /* following options are for gpu */ - opts->gpu_sort = 1; // access nupts in an ordered way for nupts driven method + /* following options are for gpu */ + opts->gpu_sort = 1; // access nupts in an ordered way for nupts driven method - opts->gpu_maxsubprobsize = 1024; - opts->gpu_obinsizex = -1; - opts->gpu_obinsizey = -1; - opts->gpu_obinsizez = -1; + opts->gpu_maxsubprobsize = 1024; + opts->gpu_obinsizex = -1; + opts->gpu_obinsizey = -1; + opts->gpu_obinsizez = -1; - opts->gpu_binsizex = -1; - opts->gpu_binsizey = -1; - opts->gpu_binsizez = -1; + opts->gpu_binsizex = -1; + opts->gpu_binsizey = -1; + opts->gpu_binsizez = -1; - opts->gpu_spreadinterponly = 0; // default to do the whole nufft + opts->gpu_spreadinterponly = 0; // default to do the whole nufft - opts->gpu_maxbatchsize = 0; // Heuristically set - opts->gpu_stream = cudaStreamDefault; + opts->gpu_maxbatchsize = 0; // Heuristically set + opts->gpu_stream = cudaStreamDefault; - opts->gpu_kerevalmeth = 1; // Horner + opts->gpu_kerevalmeth = 1; // Horner - opts->gpu_method = 0; // Auto method (2 for type 1, 2 for type 2). + opts->gpu_method = 0; // Auto method (2 for type 1, 2 for type 2). - // By default, only use device 0 - opts->gpu_device_id = 0; + // By default, only use device 0 + opts->gpu_device_id = 0; - opts->modeord = 0; + opts->modeord = 0; } } diff --git a/src/cuda/deconvolve_wrapper.cu b/src/cuda/deconvolve_wrapper.cu index 75d8adda3..94eb6b4c8 100644 --- a/src/cuda/deconvolve_wrapper.cu +++ b/src/cuda/deconvolve_wrapper.cu @@ -11,207 +11,218 @@ namespace cufinufft { namespace deconvolve { /* Kernel for copying fw to fk with amplication by prefac/ker */ // Note: assume modeord=0: CMCL-compatible mode ordering in fk (from -N/2 up -// to N/2-1), modeord=1: FFT-compatible mode ordering in fk (from 0 to N/2-1, then -N/2 up to -1). -template -__global__ void deconvolve_1d(int ms, int nf1, cuda_complex *fw, cuda_complex *fk, T *fwkerhalf1) { - int pivot1, w1, fwkerind1; - T kervalue; - - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms; i += blockDim.x * gridDim.x) { - if (modeord == 0) { - pivot1 = i - ms / 2; - w1 = (pivot1 >= 0) ? pivot1 : nf1 + pivot1; - fwkerind1 = abs(pivot1); - } else { - pivot1 = i - ms + ms / 2; - w1 = (pivot1 >= 0) ? nf1 + i - ms : i; - fwkerind1 = (pivot1 >= 0) ? ms - i : i; - } - - kervalue = fwkerhalf1[fwkerind1]; - fk[i].x = fw[w1].x / kervalue; - fk[i].y = fw[w1].y / kervalue; +// to N/2-1), modeord=1: FFT-compatible mode ordering in fk (from 0 to N/2-1, then -N/2 up +// to -1). +template +__global__ void deconvolve_1d(int ms, int nf1, cuda_complex *fw, cuda_complex *fk, + T *fwkerhalf1) { + int pivot1, w1, fwkerind1; + T kervalue; + + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms; + i += blockDim.x * gridDim.x) { + if (modeord == 0) { + pivot1 = i - ms / 2; + w1 = (pivot1 >= 0) ? pivot1 : nf1 + pivot1; + fwkerind1 = abs(pivot1); + } else { + pivot1 = i - ms + ms / 2; + w1 = (pivot1 >= 0) ? nf1 + i - ms : i; + fwkerind1 = (pivot1 >= 0) ? 
ms - i : i; } + + kervalue = fwkerhalf1[fwkerind1]; + fk[i].x = fw[w1].x / kervalue; + fk[i].y = fw[w1].y / kervalue; + } } -template -__global__ void deconvolve_2d(int ms, int mt, int nf1, int nf2, cuda_complex *fw, cuda_complex *fk, T *fwkerhalf1, - T *fwkerhalf2) { - int pivot1, pivot2, w1, w2, fwkerind1, fwkerind2; - int k1, k2, inidx, outidx; - T kervalue; - - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt; i += blockDim.x * gridDim.x) { - k1 = i % ms; - k2 = i / ms; - outidx = k1 + k2 * ms; - - if (modeord == 0) { - pivot1 = k1 - ms / 2; - pivot2 = k2 - mt / 2; - w1 = (pivot1 >= 0) ? pivot1 : nf1 + pivot1; - w2 = (pivot2 >= 0) ? pivot2 : nf2 + pivot2; - fwkerind1 = abs(pivot1); - fwkerind2 = abs(pivot2); - } else { - pivot1 = k1 - ms + ms / 2; - pivot2 = k2 - mt + mt / 2; - w1 = (pivot1 >= 0) ? nf1 + k1 - ms : k1; - w2 = (pivot2 >= 0) ? nf2 + k2 - mt : k2; - fwkerind1 = (pivot1 >= 0) ? ms - k1 : k1; - fwkerind2 = (pivot2 >= 0) ? mt - k2 : k2; - } - - inidx = w1 + w2 * nf1; - kervalue = fwkerhalf1[fwkerind1] * fwkerhalf2[fwkerind2]; - fk[outidx].x = fw[inidx].x / kervalue; - fk[outidx].y = fw[inidx].y / kervalue; +template +__global__ void deconvolve_2d(int ms, int mt, int nf1, int nf2, cuda_complex *fw, + cuda_complex *fk, T *fwkerhalf1, T *fwkerhalf2) { + int pivot1, pivot2, w1, w2, fwkerind1, fwkerind2; + int k1, k2, inidx, outidx; + T kervalue; + + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt; + i += blockDim.x * gridDim.x) { + k1 = i % ms; + k2 = i / ms; + outidx = k1 + k2 * ms; + + if (modeord == 0) { + pivot1 = k1 - ms / 2; + pivot2 = k2 - mt / 2; + w1 = (pivot1 >= 0) ? pivot1 : nf1 + pivot1; + w2 = (pivot2 >= 0) ? pivot2 : nf2 + pivot2; + fwkerind1 = abs(pivot1); + fwkerind2 = abs(pivot2); + } else { + pivot1 = k1 - ms + ms / 2; + pivot2 = k2 - mt + mt / 2; + w1 = (pivot1 >= 0) ? nf1 + k1 - ms : k1; + w2 = (pivot2 >= 0) ? nf2 + k2 - mt : k2; + fwkerind1 = (pivot1 >= 0) ? ms - k1 : k1; + fwkerind2 = (pivot2 >= 0) ? mt - k2 : k2; } + + inidx = w1 + w2 * nf1; + kervalue = fwkerhalf1[fwkerind1] * fwkerhalf2[fwkerind2]; + fk[outidx].x = fw[inidx].x / kervalue; + fk[outidx].y = fw[inidx].y / kervalue; + } } -template -__global__ void deconvolve_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, cuda_complex *fw, - cuda_complex *fk, T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3) { - int pivot1, pivot2, pivot3, w1, w2, w3, fwkerind1, fwkerind2, fwkerind3; - int k1, k2, k3, inidx, outidx; - T kervalue; - - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt * mu; i += blockDim.x * gridDim.x) { - k1 = i % ms; - k2 = (i / ms) % mt; - k3 = (i / ms / mt); - outidx = k1 + k2 * ms + k3 * ms * mt; - - if (modeord == 0) { - pivot1 = k1 - ms / 2; - pivot2 = k2 - mt / 2; - pivot3 = k3 - mu / 2; - w1 = (pivot1 >= 0) ? pivot1 : nf1 + pivot1; - w2 = (pivot2 >= 0) ? pivot2 : nf2 + pivot2; - w3 = (pivot3 >= 0) ? pivot3 : nf3 + pivot3; - fwkerind1 = abs(pivot1); - fwkerind2 = abs(pivot2); - fwkerind3 = abs(pivot3); - } else { - pivot1 = k1 - ms + ms / 2; - pivot2 = k2 - mt + mt / 2; - pivot3 = k3 - mu + mu / 2; - w1 = (pivot1 >= 0) ? nf1 + k1 - ms : k1; - w2 = (pivot2 >= 0) ? nf2 + k2 - mt : k2; - w3 = (pivot3 >= 0) ? nf3 + k3 - mu : k3; - fwkerind1 = (pivot1 >= 0) ? ms - k1 : k1; - fwkerind2 = (pivot2 >= 0) ? mt - k2 : k2; - fwkerind3 = (pivot3 >= 0) ? 
mu - k3 : k3; - } - - inidx = w1 + w2 * nf1 + w3 * nf1 * nf2; - kervalue = fwkerhalf1[fwkerind1] * fwkerhalf2[fwkerind2] * fwkerhalf3[fwkerind3]; - fk[outidx].x = fw[inidx].x / kervalue; - fk[outidx].y = fw[inidx].y / kervalue; +template +__global__ void deconvolve_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, + cuda_complex *fw, cuda_complex *fk, T *fwkerhalf1, + T *fwkerhalf2, T *fwkerhalf3) { + int pivot1, pivot2, pivot3, w1, w2, w3, fwkerind1, fwkerind2, fwkerind3; + int k1, k2, k3, inidx, outidx; + T kervalue; + + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt * mu; + i += blockDim.x * gridDim.x) { + k1 = i % ms; + k2 = (i / ms) % mt; + k3 = (i / ms / mt); + outidx = k1 + k2 * ms + k3 * ms * mt; + + if (modeord == 0) { + pivot1 = k1 - ms / 2; + pivot2 = k2 - mt / 2; + pivot3 = k3 - mu / 2; + w1 = (pivot1 >= 0) ? pivot1 : nf1 + pivot1; + w2 = (pivot2 >= 0) ? pivot2 : nf2 + pivot2; + w3 = (pivot3 >= 0) ? pivot3 : nf3 + pivot3; + fwkerind1 = abs(pivot1); + fwkerind2 = abs(pivot2); + fwkerind3 = abs(pivot3); + } else { + pivot1 = k1 - ms + ms / 2; + pivot2 = k2 - mt + mt / 2; + pivot3 = k3 - mu + mu / 2; + w1 = (pivot1 >= 0) ? nf1 + k1 - ms : k1; + w2 = (pivot2 >= 0) ? nf2 + k2 - mt : k2; + w3 = (pivot3 >= 0) ? nf3 + k3 - mu : k3; + fwkerind1 = (pivot1 >= 0) ? ms - k1 : k1; + fwkerind2 = (pivot2 >= 0) ? mt - k2 : k2; + fwkerind3 = (pivot3 >= 0) ? mu - k3 : k3; } + + inidx = w1 + w2 * nf1 + w3 * nf1 * nf2; + kervalue = fwkerhalf1[fwkerind1] * fwkerhalf2[fwkerind2] * fwkerhalf3[fwkerind3]; + fk[outidx].x = fw[inidx].x / kervalue; + fk[outidx].y = fw[inidx].y / kervalue; + } } /* Kernel for copying fk to fw with same amplication */ -template -__global__ void amplify_1d(int ms, int nf1, cuda_complex *fw, cuda_complex *fk, T *fwkerhalf1) { - int pivot1, w1, fwkerind1; - T kervalue; - - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms; i += blockDim.x * gridDim.x) { - if (modeord == 0) { - pivot1 = i - ms / 2; - w1 = (pivot1 >= 0) ? pivot1 : nf1 + pivot1; - fwkerind1 = abs(pivot1); - } else { - pivot1 = i - ms + ms / 2; - w1 = (pivot1 >= 0) ? nf1 + i - ms : i; - fwkerind1 = (pivot1 >= 0) ? ms - i : i; - } - - kervalue = fwkerhalf1[fwkerind1]; - fw[w1].x = fk[i].x / kervalue; - fw[w1].y = fk[i].y / kervalue; +template +__global__ void amplify_1d(int ms, int nf1, cuda_complex *fw, cuda_complex *fk, + T *fwkerhalf1) { + int pivot1, w1, fwkerind1; + T kervalue; + + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms; + i += blockDim.x * gridDim.x) { + if (modeord == 0) { + pivot1 = i - ms / 2; + w1 = (pivot1 >= 0) ? pivot1 : nf1 + pivot1; + fwkerind1 = abs(pivot1); + } else { + pivot1 = i - ms + ms / 2; + w1 = (pivot1 >= 0) ? nf1 + i - ms : i; + fwkerind1 = (pivot1 >= 0) ? ms - i : i; } + + kervalue = fwkerhalf1[fwkerind1]; + fw[w1].x = fk[i].x / kervalue; + fw[w1].y = fk[i].y / kervalue; + } } -template -__global__ void amplify_2d(int ms, int mt, int nf1, int nf2, cuda_complex *fw, cuda_complex *fk, T *fwkerhalf1, - T *fwkerhalf2) { - int pivot1, pivot2, w1, w2, fwkerind1, fwkerind2; - int k1, k2, inidx, outidx; - T kervalue; - - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt; i += blockDim.x * gridDim.x) { - k1 = i % ms; - k2 = i / ms; - inidx = k1 + k2 * ms; - - if (modeord == 0) { - pivot1 = k1 - ms / 2; - pivot2 = k2 - mt / 2; - w1 = (pivot1 >= 0) ? pivot1 : nf1 + pivot1; - w2 = (pivot2 >= 0) ? 
pivot2 : nf2 + pivot2; - fwkerind1 = abs(pivot1); - fwkerind2 = abs(pivot2); - } else { - pivot1 = k1 - ms + ms / 2; - pivot2 = k2 - mt + mt / 2; - w1 = (pivot1 >= 0) ? nf1 + k1 - ms : k1; - w2 = (pivot2 >= 0) ? nf2 + k2 - mt : k2; - fwkerind1 = (pivot1 >= 0) ? ms - k1 : k1; - fwkerind2 = (pivot2 >= 0) ? mt - k2 : k2; - } - - outidx = w1 + w2 * nf1; - kervalue = fwkerhalf1[fwkerind1] * fwkerhalf2[fwkerind2]; - fw[outidx].x = fk[inidx].x / kervalue; - fw[outidx].y = fk[inidx].y / kervalue; +template +__global__ void amplify_2d(int ms, int mt, int nf1, int nf2, cuda_complex *fw, + cuda_complex *fk, T *fwkerhalf1, T *fwkerhalf2) { + int pivot1, pivot2, w1, w2, fwkerind1, fwkerind2; + int k1, k2, inidx, outidx; + T kervalue; + + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt; + i += blockDim.x * gridDim.x) { + k1 = i % ms; + k2 = i / ms; + inidx = k1 + k2 * ms; + + if (modeord == 0) { + pivot1 = k1 - ms / 2; + pivot2 = k2 - mt / 2; + w1 = (pivot1 >= 0) ? pivot1 : nf1 + pivot1; + w2 = (pivot2 >= 0) ? pivot2 : nf2 + pivot2; + fwkerind1 = abs(pivot1); + fwkerind2 = abs(pivot2); + } else { + pivot1 = k1 - ms + ms / 2; + pivot2 = k2 - mt + mt / 2; + w1 = (pivot1 >= 0) ? nf1 + k1 - ms : k1; + w2 = (pivot2 >= 0) ? nf2 + k2 - mt : k2; + fwkerind1 = (pivot1 >= 0) ? ms - k1 : k1; + fwkerind2 = (pivot2 >= 0) ? mt - k2 : k2; } + + outidx = w1 + w2 * nf1; + kervalue = fwkerhalf1[fwkerind1] * fwkerhalf2[fwkerind2]; + fw[outidx].x = fk[inidx].x / kervalue; + fw[outidx].y = fk[inidx].y / kervalue; + } } -template -__global__ void amplify_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, cuda_complex *fw, cuda_complex *fk, - T *fwkerhalf1, T *fwkerhalf2, T *fwkerhalf3) { - int pivot1, pivot2, pivot3, w1, w2, w3, fwkerind1, fwkerind2, fwkerind3; - int k1, k2, k3, inidx, outidx; - T kervalue; - - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt * mu; i += blockDim.x * gridDim.x) { - k1 = i % ms; - k2 = (i / ms) % mt; - k3 = (i / ms / mt); - inidx = k1 + k2 * ms + k3 * ms * mt; - - if (modeord == 0) { - pivot1 = k1 - ms / 2; - pivot2 = k2 - mt / 2; - pivot3 = k3 - mu / 2; - w1 = (pivot1 >= 0) ? pivot1 : nf1 + pivot1; - w2 = (pivot2 >= 0) ? pivot2 : nf2 + pivot2; - w3 = (pivot3 >= 0) ? pivot3 : nf3 + pivot3; - fwkerind1 = abs(pivot1); - fwkerind2 = abs(pivot2); - fwkerind3 = abs(pivot3); - } else { - pivot1 = k1 - ms + ms / 2; - pivot2 = k2 - mt + mt / 2; - pivot3 = k3 - mu + mu / 2; - w1 = (pivot1 >= 0) ? nf1 + k1 - ms : k1; - w2 = (pivot2 >= 0) ? nf2 + k2 - mt : k2; - w3 = (pivot3 >= 0) ? nf3 + k3 - mu : k3; - fwkerind1 = (pivot1 >= 0) ? ms - k1 : k1; - fwkerind2 = (pivot2 >= 0) ? mt - k2 : k2; - fwkerind3 = (pivot3 >= 0) ? mu - k3 : k3; - } - - outidx = w1 + w2 * nf1 + w3 * nf1 * nf2; - kervalue = fwkerhalf1[fwkerind1] * fwkerhalf2[fwkerind2] * fwkerhalf3[fwkerind3]; - fw[outidx].x = fk[inidx].x / kervalue; - fw[outidx].y = fk[inidx].y / kervalue; +template +__global__ void amplify_3d(int ms, int mt, int mu, int nf1, int nf2, int nf3, + cuda_complex *fw, cuda_complex *fk, T *fwkerhalf1, + T *fwkerhalf2, T *fwkerhalf3) { + int pivot1, pivot2, pivot3, w1, w2, w3, fwkerind1, fwkerind2, fwkerind3; + int k1, k2, k3, inidx, outidx; + T kervalue; + + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < ms * mt * mu; + i += blockDim.x * gridDim.x) { + k1 = i % ms; + k2 = (i / ms) % mt; + k3 = (i / ms / mt); + inidx = k1 + k2 * ms + k3 * ms * mt; + + if (modeord == 0) { + pivot1 = k1 - ms / 2; + pivot2 = k2 - mt / 2; + pivot3 = k3 - mu / 2; + w1 = (pivot1 >= 0) ? 
pivot1 : nf1 + pivot1; + w2 = (pivot2 >= 0) ? pivot2 : nf2 + pivot2; + w3 = (pivot3 >= 0) ? pivot3 : nf3 + pivot3; + fwkerind1 = abs(pivot1); + fwkerind2 = abs(pivot2); + fwkerind3 = abs(pivot3); + } else { + pivot1 = k1 - ms + ms / 2; + pivot2 = k2 - mt + mt / 2; + pivot3 = k3 - mu + mu / 2; + w1 = (pivot1 >= 0) ? nf1 + k1 - ms : k1; + w2 = (pivot2 >= 0) ? nf2 + k2 - mt : k2; + w3 = (pivot3 >= 0) ? nf3 + k3 - mu : k3; + fwkerind1 = (pivot1 >= 0) ? ms - k1 : k1; + fwkerind2 = (pivot2 >= 0) ? mt - k2 : k2; + fwkerind3 = (pivot3 >= 0) ? mu - k3 : k3; } + + outidx = w1 + w2 * nf1 + w3 * nf1 * nf2; + kervalue = fwkerhalf1[fwkerind1] * fwkerhalf2[fwkerind2] * fwkerhalf3[fwkerind3]; + fw[outidx].x = fk[inidx].x / kervalue; + fw[outidx].y = fk[inidx].y / kervalue; + } } -template +template int cudeconvolve1d(cufinufft_plan_t *d_plan, int blksize) /* wrapper for deconvolution & amplication in 1D. @@ -219,29 +230,30 @@ int cudeconvolve1d(cufinufft_plan_t *d_plan, int blksize) Melody Shih 11/21/21 */ { - auto &stream = d_plan->stream; - - int ms = d_plan->ms; - int nf1 = d_plan->nf1; - int nmodes = ms; - int maxbatchsize = d_plan->maxbatchsize; - - if (d_plan->spopts.spread_direction == 1) { - for (int t = 0; t < blksize; t++) { - deconvolve_1d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, nf1, d_plan->fw + t * nf1, - d_plan->fk + t * nmodes, d_plan->fwkerhalf1); - } - } else { - checkCudaErrors(cudaMemsetAsync(d_plan->fw, 0, maxbatchsize * nf1 * sizeof(cuda_complex), stream)); - for (int t = 0; t < blksize; t++) { - amplify_1d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, nf1, d_plan->fw + t * nf1, - d_plan->fk + t * nmodes, d_plan->fwkerhalf1); - } + auto &stream = d_plan->stream; + + int ms = d_plan->ms; + int nf1 = d_plan->nf1; + int nmodes = ms; + int maxbatchsize = d_plan->maxbatchsize; + + if (d_plan->spopts.spread_direction == 1) { + for (int t = 0; t < blksize; t++) { + deconvolve_1d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>( + ms, nf1, d_plan->fw + t * nf1, d_plan->fk + t * nmodes, d_plan->fwkerhalf1); } - return 0; + } else { + checkCudaErrors(cudaMemsetAsync( + d_plan->fw, 0, maxbatchsize * nf1 * sizeof(cuda_complex), stream)); + for (int t = 0; t < blksize; t++) { + amplify_1d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>( + ms, nf1, d_plan->fw + t * nf1, d_plan->fk + t * nmodes, d_plan->fwkerhalf1); + } + } + return 0; } -template +template int cudeconvolve2d(cufinufft_plan_t *d_plan, int blksize) /* wrapper for deconvolution & amplication in 2D. 
@@ -249,33 +261,34 @@ int cudeconvolve2d(cufinufft_plan_t *d_plan, int blksize) Melody Shih 07/25/19 */ { - auto &stream = d_plan->stream; - - int ms = d_plan->ms; - int mt = d_plan->mt; - int nf1 = d_plan->nf1; - int nf2 = d_plan->nf2; - int nmodes = ms * mt; - int maxbatchsize = d_plan->maxbatchsize; - - if (d_plan->spopts.spread_direction == 1) { - for (int t = 0; t < blksize; t++) { - deconvolve_2d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, mt, nf1, nf2, d_plan->fw + t * nf1 * nf2, - d_plan->fk + t * nmodes, d_plan->fwkerhalf1, - d_plan->fwkerhalf2); - } - } else { - checkCudaErrors(cudaMemsetAsync(d_plan->fw, 0, maxbatchsize * nf1 * nf2 * sizeof(cuda_complex), stream)); - for (int t = 0; t < blksize; t++) { - amplify_2d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>(ms, mt, nf1, nf2, d_plan->fw + t * nf1 * nf2, - d_plan->fk + t * nmodes, d_plan->fwkerhalf1, - d_plan->fwkerhalf2); - } + auto &stream = d_plan->stream; + + int ms = d_plan->ms; + int mt = d_plan->mt; + int nf1 = d_plan->nf1; + int nf2 = d_plan->nf2; + int nmodes = ms * mt; + int maxbatchsize = d_plan->maxbatchsize; + + if (d_plan->spopts.spread_direction == 1) { + for (int t = 0; t < blksize; t++) { + deconvolve_2d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>( + ms, mt, nf1, nf2, d_plan->fw + t * nf1 * nf2, d_plan->fk + t * nmodes, + d_plan->fwkerhalf1, d_plan->fwkerhalf2); + } + } else { + checkCudaErrors(cudaMemsetAsync( + d_plan->fw, 0, maxbatchsize * nf1 * nf2 * sizeof(cuda_complex), stream)); + for (int t = 0; t < blksize; t++) { + amplify_2d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>( + ms, mt, nf1, nf2, d_plan->fw + t * nf1 * nf2, d_plan->fk + t * nmodes, + d_plan->fwkerhalf1, d_plan->fwkerhalf2); } - return 0; + } + return 0; } -template +template int cudeconvolve3d(cufinufft_plan_t *d_plan, int blksize) /* wrapper for deconvolution & amplication in 3D. 
@@ -283,44 +296,46 @@ int cudeconvolve3d(cufinufft_plan_t *d_plan, int blksize) Melody Shih 07/25/19 */ { - auto &stream = d_plan->stream; - - int ms = d_plan->ms; - int mt = d_plan->mt; - int mu = d_plan->mu; - int nf1 = d_plan->nf1; - int nf2 = d_plan->nf2; - int nf3 = d_plan->nf3; - int nmodes = ms * mt * mu; - int maxbatchsize = d_plan->maxbatchsize; - if (d_plan->spopts.spread_direction == 1) { - for (int t = 0; t < blksize; t++) { - deconvolve_3d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>( - ms, mt, mu, nf1, nf2, nf3, d_plan->fw + t * nf1 * nf2 * nf3, d_plan->fk + t * nmodes, - d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3); - } - } else { - checkCudaErrors( - cudaMemsetAsync(d_plan->fw, 0, maxbatchsize * nf1 * nf2 * nf3 * sizeof(cuda_complex), stream)); - for (int t = 0; t < blksize; t++) { - amplify_3d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>( - ms, mt, mu, nf1, nf2, nf3, d_plan->fw + t * nf1 * nf2 * nf3, d_plan->fk + t * nmodes, - d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3); - } + auto &stream = d_plan->stream; + + int ms = d_plan->ms; + int mt = d_plan->mt; + int mu = d_plan->mu; + int nf1 = d_plan->nf1; + int nf2 = d_plan->nf2; + int nf3 = d_plan->nf3; + int nmodes = ms * mt * mu; + int maxbatchsize = d_plan->maxbatchsize; + if (d_plan->spopts.spread_direction == 1) { + for (int t = 0; t < blksize; t++) { + deconvolve_3d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>( + ms, mt, mu, nf1, nf2, nf3, d_plan->fw + t * nf1 * nf2 * nf3, + d_plan->fk + t * nmodes, d_plan->fwkerhalf1, d_plan->fwkerhalf2, + d_plan->fwkerhalf3); + } + } else { + checkCudaErrors(cudaMemsetAsync( + d_plan->fw, 0, maxbatchsize * nf1 * nf2 * nf3 * sizeof(cuda_complex), stream)); + for (int t = 0; t < blksize; t++) { + amplify_3d<<<(nmodes + 256 - 1) / 256, 256, 0, stream>>>( + ms, mt, mu, nf1, nf2, nf3, d_plan->fw + t * nf1 * nf2 * nf3, + d_plan->fk + t * nmodes, d_plan->fwkerhalf1, d_plan->fwkerhalf2, + d_plan->fwkerhalf3); } - return 0; + } + return 0; } -template int cudeconvolve1d(cufinufft_plan_t *d_plan, int blksize); -template int cudeconvolve1d(cufinufft_plan_t *d_plan, int blksize); +template int cudeconvolve1d(cufinufft_plan_t *d_plan, int blksize); +template int cudeconvolve1d(cufinufft_plan_t *d_plan, int blksize); template int cudeconvolve1d(cufinufft_plan_t *d_plan, int blksize); template int cudeconvolve1d(cufinufft_plan_t *d_plan, int blksize); -template int cudeconvolve2d(cufinufft_plan_t *d_plan, int blksize); -template int cudeconvolve2d(cufinufft_plan_t *d_plan, int blksize); +template int cudeconvolve2d(cufinufft_plan_t *d_plan, int blksize); +template int cudeconvolve2d(cufinufft_plan_t *d_plan, int blksize); template int cudeconvolve2d(cufinufft_plan_t *d_plan, int blksize); template int cudeconvolve2d(cufinufft_plan_t *d_plan, int blksize); -template int cudeconvolve3d(cufinufft_plan_t *d_plan, int blksize); -template int cudeconvolve3d(cufinufft_plan_t *d_plan, int blksize); +template int cudeconvolve3d(cufinufft_plan_t *d_plan, int blksize); +template int cudeconvolve3d(cufinufft_plan_t *d_plan, int blksize); template int cudeconvolve3d(cufinufft_plan_t *d_plan, int blksize); template int cudeconvolve3d(cufinufft_plan_t *d_plan, int blksize); diff --git a/src/cuda/memtransfer_wrapper.cu b/src/cuda/memtransfer_wrapper.cu index a00fa526e..ea2170b9b 100644 --- a/src/cuda/memtransfer_wrapper.cu +++ b/src/cuda/memtransfer_wrapper.cu @@ -11,7 +11,7 @@ namespace cufinufft { namespace memtransfer { -template +template int allocgpumem1d_plan(cufinufft_plan_t 
*d_plan) /* wrapper for gpu memory allocation in "plan" stage. @@ -19,53 +19,60 @@ int allocgpumem1d_plan(cufinufft_plan_t *d_plan) Melody Shih 11/21/21 */ { - utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); - auto &stream = d_plan->stream; - - int ier; - int nf1 = d_plan->nf1; - int maxbatchsize = d_plan->maxbatchsize; - - switch (d_plan->opts.gpu_method) { - case 1: { - if (d_plan->opts.gpu_sort) { - int numbins = ceil((T)nf1 / d_plan->opts.gpu_binsizex); - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->binsize, numbins * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->binstartpts, numbins * sizeof(int), stream)))) - goto finalize; - } - } break; - case 2: { - int numbins = ceil((T)nf1 / d_plan->opts.gpu_binsizex); - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->numsubprob, numbins * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->binsize, numbins * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->binstartpts, numbins * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->subprobstartpts, (numbins + 1) * sizeof(int), stream)))) - goto finalize; - } break; - default: - std::cerr << "err: invalid method " << std::endl; - } - - if (!d_plan->opts.gpu_spreadinterponly) { - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->fw, maxbatchsize * nf1 * sizeof(cuda_complex), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->fwkerhalf1, (nf1 / 2 + 1) * sizeof(T), stream)))) - goto finalize; + utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); + auto &stream = d_plan->stream; + + int ier; + int nf1 = d_plan->nf1; + int maxbatchsize = d_plan->maxbatchsize; + + switch (d_plan->opts.gpu_method) { + case 1: { + if (d_plan->opts.gpu_sort) { + int numbins = ceil((T)nf1 / d_plan->opts.gpu_binsizex); + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->binsize, numbins * sizeof(int), stream)))) + goto finalize; + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->binstartpts, numbins * sizeof(int), stream)))) + goto finalize; } + } break; + case 2: { + int numbins = ceil((T)nf1 / d_plan->opts.gpu_binsizex); + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->numsubprob, numbins * sizeof(int), stream)))) + goto finalize; + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->binsize, numbins * sizeof(int), stream)))) + goto finalize; + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->binstartpts, numbins * sizeof(int), stream)))) + goto finalize; + if ((ier = checkCudaErrors(cudaMallocAsync( + &d_plan->subprobstartpts, (numbins + 1) * sizeof(int), stream)))) + goto finalize; + } break; + default: + std::cerr << "err: invalid method " << std::endl; + } + + if (!d_plan->opts.gpu_spreadinterponly) { + if ((ier = checkCudaErrors(cudaMallocAsync( + &d_plan->fw, maxbatchsize * nf1 * sizeof(cuda_complex), stream)))) + goto finalize; + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->fwkerhalf1, (nf1 / 2 + 1) * sizeof(T), stream)))) + goto finalize; + } finalize: - if (ier) - freegpumemory(d_plan); + if (ier) freegpumemory(d_plan); - return ier; + return ier; } -template +template int allocgpumem1d_nupts(cufinufft_plan_t *d_plan) /* wrapper for gpu memory allocation in "setNUpts" stage. 
@@ -73,41 +80,43 @@ int allocgpumem1d_nupts(cufinufft_plan_t *d_plan) Melody Shih 11/21/21 */ { - utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); - auto &stream = d_plan->stream; - int ier; - - int M = d_plan->M; - CUDA_FREE_AND_NULL(d_plan->sortidx, stream); - CUDA_FREE_AND_NULL(d_plan->idxnupts, stream); - - switch (d_plan->opts.gpu_method) { - case 1: { - if (d_plan->opts.gpu_sort && - (ier = checkCudaErrors(cudaMallocAsync(&d_plan->sortidx, M * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream)))) - goto finalize; - } break; - case 2: { - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->sortidx, M * sizeof(int), stream)))) - goto finalize; - } break; - default: - std::cerr << "[allocgpumem1d_nupts] error: invalid method\n"; - ier = FINUFFT_ERR_METHOD_NOTVALID; - } + utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); + auto &stream = d_plan->stream; + int ier; + + int M = d_plan->M; + CUDA_FREE_AND_NULL(d_plan->sortidx, stream); + CUDA_FREE_AND_NULL(d_plan->idxnupts, stream); + + switch (d_plan->opts.gpu_method) { + case 1: { + if (d_plan->opts.gpu_sort && (ier = checkCudaErrors(cudaMallocAsync( + &d_plan->sortidx, M * sizeof(int), stream)))) + goto finalize; + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream)))) + goto finalize; + } break; + case 2: { + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream)))) + goto finalize; + if ((ier = + checkCudaErrors(cudaMallocAsync(&d_plan->sortidx, M * sizeof(int), stream)))) + goto finalize; + } break; + default: + std::cerr << "[allocgpumem1d_nupts] error: invalid method\n"; + ier = FINUFFT_ERR_METHOD_NOTVALID; + } finalize: - if (ier) - freegpumemory(d_plan); + if (ier) freegpumemory(d_plan); - return ier; + return ier; } -template +template int allocgpumem2d_plan(cufinufft_plan_t *d_plan) /* wrapper for gpu memory allocation in "plan" stage. 
@@ -115,66 +124,70 @@ int allocgpumem2d_plan(cufinufft_plan_t *d_plan) Melody Shih 07/25/19 */ { - utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); - auto &stream = d_plan->stream; - int ier; - - int nf1 = d_plan->nf1; - int nf2 = d_plan->nf2; - int maxbatchsize = d_plan->maxbatchsize; - - switch (d_plan->opts.gpu_method) { - case 1: { - if (d_plan->opts.gpu_sort) { - int numbins[2]; - numbins[0] = ceil((T)nf1 / d_plan->opts.gpu_binsizex); - numbins[1] = ceil((T)nf2 / d_plan->opts.gpu_binsizey); - if ((ier = - checkCudaErrors(cudaMallocAsync(&d_plan->binsize, numbins[0] * numbins[1] * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors( - cudaMallocAsync(&d_plan->binstartpts, numbins[0] * numbins[1] * sizeof(int), stream)))) - goto finalize; - } - } break; - case 2: { - int64_t numbins[2]; - numbins[0] = ceil((T)nf1 / d_plan->opts.gpu_binsizex); - numbins[1] = ceil((T)nf2 / d_plan->opts.gpu_binsizey); - if ((ier = - checkCudaErrors(cudaMallocAsync(&d_plan->numsubprob, numbins[0] * numbins[1] * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->binsize, numbins[0] * numbins[1] * sizeof(int), stream)))) - goto finalize; - if ((ier = - checkCudaErrors(cudaMallocAsync(&d_plan->binstartpts, numbins[0] * numbins[1] * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors( - cudaMallocAsync(&d_plan->subprobstartpts, (numbins[0] * numbins[1] + 1) * sizeof(int), stream)))) - goto finalize; - } break; - default: - std::cerr << "[allocgpumem2d_plan] error: invalid method\n"; - } - - if (!d_plan->opts.gpu_spreadinterponly) { - if ((ier = checkCudaErrors( - cudaMallocAsync(&d_plan->fw, maxbatchsize * nf1 * nf2 * sizeof(cuda_complex), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->fwkerhalf1, (nf1 / 2 + 1) * sizeof(T), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->fwkerhalf2, (nf2 / 2 + 1) * sizeof(T), stream)))) - goto finalize; + utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); + auto &stream = d_plan->stream; + int ier; + + int nf1 = d_plan->nf1; + int nf2 = d_plan->nf2; + int maxbatchsize = d_plan->maxbatchsize; + + switch (d_plan->opts.gpu_method) { + case 1: { + if (d_plan->opts.gpu_sort) { + int numbins[2]; + numbins[0] = ceil((T)nf1 / d_plan->opts.gpu_binsizex); + numbins[1] = ceil((T)nf2 / d_plan->opts.gpu_binsizey); + if ((ier = checkCudaErrors(cudaMallocAsync( + &d_plan->binsize, numbins[0] * numbins[1] * sizeof(int), stream)))) + goto finalize; + if ((ier = checkCudaErrors(cudaMallocAsync( + &d_plan->binstartpts, numbins[0] * numbins[1] * sizeof(int), stream)))) + goto finalize; } + } break; + case 2: { + int64_t numbins[2]; + numbins[0] = ceil((T)nf1 / d_plan->opts.gpu_binsizex); + numbins[1] = ceil((T)nf2 / d_plan->opts.gpu_binsizey); + if ((ier = checkCudaErrors(cudaMallocAsync( + &d_plan->numsubprob, numbins[0] * numbins[1] * sizeof(int), stream)))) + goto finalize; + if ((ier = checkCudaErrors(cudaMallocAsync( + &d_plan->binsize, numbins[0] * numbins[1] * sizeof(int), stream)))) + goto finalize; + if ((ier = checkCudaErrors(cudaMallocAsync( + &d_plan->binstartpts, numbins[0] * numbins[1] * sizeof(int), stream)))) + goto finalize; + if ((ier = + checkCudaErrors(cudaMallocAsync(&d_plan->subprobstartpts, + (numbins[0] * numbins[1] + 1) * sizeof(int), + stream)))) + goto finalize; + } break; + default: + std::cerr << "[allocgpumem2d_plan] error: invalid method\n"; + } + + if (!d_plan->opts.gpu_spreadinterponly) 
{ + if ((ier = checkCudaErrors(cudaMallocAsync( + &d_plan->fw, maxbatchsize * nf1 * nf2 * sizeof(cuda_complex), stream)))) + goto finalize; + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->fwkerhalf1, (nf1 / 2 + 1) * sizeof(T), stream)))) + goto finalize; + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->fwkerhalf2, (nf2 / 2 + 1) * sizeof(T), stream)))) + goto finalize; + } finalize: - if (ier) - freegpumemory(d_plan); + if (ier) freegpumemory(d_plan); - return ier; + return ier; } -template +template int allocgpumem2d_nupts(cufinufft_plan_t *d_plan) /* wrapper for gpu memory allocation in "setNUpts" stage. @@ -182,41 +195,43 @@ int allocgpumem2d_nupts(cufinufft_plan_t *d_plan) Melody Shih 07/25/19 */ { - utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); - auto &stream = d_plan->stream; - int ier; - - const int M = d_plan->M; - - CUDA_FREE_AND_NULL(d_plan->sortidx, stream); - CUDA_FREE_AND_NULL(d_plan->idxnupts, stream); - - switch (d_plan->opts.gpu_method) { - case 1: { - if (d_plan->opts.gpu_sort && - (ier = checkCudaErrors(cudaMallocAsync(&d_plan->sortidx, M * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream)))) - goto finalize; - } break; - case 2: { - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->sortidx, M * sizeof(int), stream)))) - goto finalize; - } break; - default: - std::cerr << "[allocgpumem2d_nupts] error: invalid method\n"; - } + utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); + auto &stream = d_plan->stream; + int ier; + + const int M = d_plan->M; + + CUDA_FREE_AND_NULL(d_plan->sortidx, stream); + CUDA_FREE_AND_NULL(d_plan->idxnupts, stream); + + switch (d_plan->opts.gpu_method) { + case 1: { + if (d_plan->opts.gpu_sort && (ier = checkCudaErrors(cudaMallocAsync( + &d_plan->sortidx, M * sizeof(int), stream)))) + goto finalize; + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream)))) + goto finalize; + } break; + case 2: { + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream)))) + goto finalize; + if ((ier = + checkCudaErrors(cudaMallocAsync(&d_plan->sortidx, M * sizeof(int), stream)))) + goto finalize; + } break; + default: + std::cerr << "[allocgpumem2d_nupts] error: invalid method\n"; + } finalize: - if (ier) - freegpumemory(d_plan); + if (ier) freegpumemory(d_plan); - return ier; + return ier; } -template +template int allocgpumem3d_plan(cufinufft_plan_t *d_plan) /* wrapper for gpu memory allocation in "plan" stage. 
@@ -224,89 +239,104 @@ int allocgpumem3d_plan(cufinufft_plan_t *d_plan) Melody Shih 07/25/19 */ { - utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); - auto &stream = d_plan->stream; - int ier; - - int nf1 = d_plan->nf1; - int nf2 = d_plan->nf2; - int nf3 = d_plan->nf3; - int maxbatchsize = d_plan->maxbatchsize; - - switch (d_plan->opts.gpu_method) { - case 1: { - if (d_plan->opts.gpu_sort) { - const int64_t nbins_tot = ceil((T)nf1 / d_plan->opts.gpu_binsizex) * - ceil((T)nf2 / d_plan->opts.gpu_binsizey) * - ceil((T)nf3 / d_plan->opts.gpu_binsizez); - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->binsize, nbins_tot * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->binstartpts, nbins_tot * sizeof(int), stream)))) - goto finalize; - } - } break; - case 2: { - const int64_t nbins_tot = ceil((T)nf1 / d_plan->opts.gpu_binsizex) * ceil((T)nf2 / d_plan->opts.gpu_binsizey) * - ceil((T)nf3 / d_plan->opts.gpu_binsizez); - - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->numsubprob, nbins_tot * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->binsize, nbins_tot * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->binstartpts, nbins_tot * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->subprobstartpts, (nbins_tot + 1) * sizeof(int), stream)))) - goto finalize; - } break; - case 4: { - const int numobins[3] = {(int)ceil((T)nf1 / d_plan->opts.gpu_obinsizex), - (int)ceil((T)nf2 / d_plan->opts.gpu_obinsizey), - (int)ceil((T)nf3 / d_plan->opts.gpu_obinsizez)}; - - const int binsperobins[3] = {d_plan->opts.gpu_obinsizex / d_plan->opts.gpu_binsizex, - d_plan->opts.gpu_obinsizey / d_plan->opts.gpu_binsizey, - d_plan->opts.gpu_obinsizez / d_plan->opts.gpu_binsizez}; - - const int numbins[3] = {numobins[0] * (binsperobins[0] + 2), numobins[1] * (binsperobins[1] + 2), - numobins[2] * (binsperobins[2] + 2)}; - - const int64_t numobins_tot = numobins[0] * numobins[1] * numobins[2]; - const int64_t numbins_tot = numbins[0] * numbins[1] * numbins[2]; - - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->numsubprob, numobins_tot * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->binsize, numbins_tot * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->binstartpts, (numbins_tot + 1) * sizeof(int), stream)))) - goto finalize; - if ((ier = - checkCudaErrors(cudaMallocAsync(&d_plan->subprobstartpts, (numobins_tot + 1) * sizeof(int), stream)))) - goto finalize; - } break; - default: - std::cerr << "[allocgpumem3d_plan] error: invalid method\n"; - } - - if (!d_plan->opts.gpu_spreadinterponly) { - if ((ier = checkCudaErrors( - cudaMallocAsync(&d_plan->fw, maxbatchsize * nf1 * nf2 * nf3 * sizeof(cuda_complex), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->fwkerhalf1, (nf1 / 2 + 1) * sizeof(T), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->fwkerhalf2, (nf2 / 2 + 1) * sizeof(T), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->fwkerhalf3, (nf3 / 2 + 1) * sizeof(T), stream)))) - goto finalize; + utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); + auto &stream = d_plan->stream; + int ier; + + int nf1 = d_plan->nf1; + int nf2 = d_plan->nf2; + int nf3 = d_plan->nf3; + int maxbatchsize = d_plan->maxbatchsize; + + 
switch (d_plan->opts.gpu_method) { + case 1: { + if (d_plan->opts.gpu_sort) { + const int64_t nbins_tot = ceil((T)nf1 / d_plan->opts.gpu_binsizex) * + ceil((T)nf2 / d_plan->opts.gpu_binsizey) * + ceil((T)nf3 / d_plan->opts.gpu_binsizez); + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->binsize, nbins_tot * sizeof(int), stream)))) + goto finalize; + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->binstartpts, nbins_tot * sizeof(int), stream)))) + goto finalize; } + } break; + case 2: { + const int64_t nbins_tot = ceil((T)nf1 / d_plan->opts.gpu_binsizex) * + ceil((T)nf2 / d_plan->opts.gpu_binsizey) * + ceil((T)nf3 / d_plan->opts.gpu_binsizez); + + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->numsubprob, nbins_tot * sizeof(int), stream)))) + goto finalize; + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->binsize, nbins_tot * sizeof(int), stream)))) + goto finalize; + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->binstartpts, nbins_tot * sizeof(int), stream)))) + goto finalize; + if ((ier = checkCudaErrors(cudaMallocAsync( + &d_plan->subprobstartpts, (nbins_tot + 1) * sizeof(int), stream)))) + goto finalize; + } break; + case 4: { + const int numobins[3] = {(int)ceil((T)nf1 / d_plan->opts.gpu_obinsizex), + (int)ceil((T)nf2 / d_plan->opts.gpu_obinsizey), + (int)ceil((T)nf3 / d_plan->opts.gpu_obinsizez)}; + + const int binsperobins[3] = {d_plan->opts.gpu_obinsizex / d_plan->opts.gpu_binsizex, + d_plan->opts.gpu_obinsizey / d_plan->opts.gpu_binsizey, + d_plan->opts.gpu_obinsizez / d_plan->opts.gpu_binsizez}; + + const int numbins[3] = {numobins[0] * (binsperobins[0] + 2), + numobins[1] * (binsperobins[1] + 2), + numobins[2] * (binsperobins[2] + 2)}; + + const int64_t numobins_tot = numobins[0] * numobins[1] * numobins[2]; + const int64_t numbins_tot = numbins[0] * numbins[1] * numbins[2]; + + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->numsubprob, numobins_tot * sizeof(int), stream)))) + goto finalize; + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->binsize, numbins_tot * sizeof(int), stream)))) + goto finalize; + if ((ier = checkCudaErrors(cudaMallocAsync( + &d_plan->binstartpts, (numbins_tot + 1) * sizeof(int), stream)))) + goto finalize; + if ((ier = checkCudaErrors(cudaMallocAsync( + &d_plan->subprobstartpts, (numobins_tot + 1) * sizeof(int), stream)))) + goto finalize; + } break; + default: + std::cerr << "[allocgpumem3d_plan] error: invalid method\n"; + } + + if (!d_plan->opts.gpu_spreadinterponly) { + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->fw, + maxbatchsize * nf1 * nf2 * nf3 * sizeof(cuda_complex), + stream)))) + goto finalize; + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->fwkerhalf1, (nf1 / 2 + 1) * sizeof(T), stream)))) + goto finalize; + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->fwkerhalf2, (nf2 / 2 + 1) * sizeof(T), stream)))) + goto finalize; + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->fwkerhalf3, (nf3 / 2 + 1) * sizeof(T), stream)))) + goto finalize; + } finalize: - if (ier) - freegpumemory(d_plan); + if (ier) freegpumemory(d_plan); - return ier; + return ier; } -template +template int allocgpumem3d_nupts(cufinufft_plan_t *d_plan) /* wrapper for gpu memory allocation in "setNUpts" stage. 
@@ -314,44 +344,47 @@ int allocgpumem3d_nupts(cufinufft_plan_t *d_plan) Melody Shih 07/25/19 */ { - utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); - auto &stream = d_plan->stream; - int ier; - int M = d_plan->M; - - CUDA_FREE_AND_NULL(d_plan->sortidx, stream); - CUDA_FREE_AND_NULL(d_plan->idxnupts, stream) - - switch (d_plan->opts.gpu_method) { - case 1: { - if (d_plan->opts.gpu_sort && - ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->sortidx, M * sizeof(int), stream))))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream)))) - goto finalize; - } break; - case 2: { - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream)))) - goto finalize; - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->sortidx, M * sizeof(int), stream)))) - goto finalize; - } break; - case 4: { - if ((ier = checkCudaErrors(cudaMallocAsync(&d_plan->sortidx, M * sizeof(int), stream)))) - goto finalize; - } break; - default: - std::cerr << "[allocgpumem3d_nupts] error: invalid method\n"; - } + utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); + auto &stream = d_plan->stream; + int ier; + int M = d_plan->M; + + CUDA_FREE_AND_NULL(d_plan->sortidx, stream); + CUDA_FREE_AND_NULL(d_plan->idxnupts, stream) + + switch (d_plan->opts.gpu_method) { + case 1: { + if (d_plan->opts.gpu_sort && ((ier = checkCudaErrors(cudaMallocAsync( + &d_plan->sortidx, M * sizeof(int), stream))))) + goto finalize; + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream)))) + goto finalize; + } break; + case 2: { + if ((ier = checkCudaErrors( + cudaMallocAsync(&d_plan->idxnupts, M * sizeof(int), stream)))) + goto finalize; + if ((ier = + checkCudaErrors(cudaMallocAsync(&d_plan->sortidx, M * sizeof(int), stream)))) + goto finalize; + } break; + case 4: { + if ((ier = + checkCudaErrors(cudaMallocAsync(&d_plan->sortidx, M * sizeof(int), stream)))) + goto finalize; + } break; + default: + std::cerr << "[allocgpumem3d_nupts] error: invalid method\n"; + } finalize: - if (ier) - freegpumemory(d_plan); + if (ier) freegpumemory(d_plan); - return ier; + return ier; } -template +template void freegpumemory(cufinufft_plan_t *d_plan) /* wrapper for freeing gpu memory. 
@@ -359,24 +392,24 @@ void freegpumemory(cufinufft_plan_t *d_plan) Melody Shih 11/21/21 */ { - utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); - auto &stream = d_plan->stream; - - CUDA_FREE_AND_NULL(d_plan->fw, stream); - CUDA_FREE_AND_NULL(d_plan->fwkerhalf1, stream); - CUDA_FREE_AND_NULL(d_plan->fwkerhalf2, stream); - CUDA_FREE_AND_NULL(d_plan->fwkerhalf3, stream); - - CUDA_FREE_AND_NULL(d_plan->idxnupts, stream); - CUDA_FREE_AND_NULL(d_plan->sortidx, stream); - CUDA_FREE_AND_NULL(d_plan->numsubprob, stream); - CUDA_FREE_AND_NULL(d_plan->binsize, stream); - CUDA_FREE_AND_NULL(d_plan->binstartpts, stream); - CUDA_FREE_AND_NULL(d_plan->subprob_to_bin, stream); - CUDA_FREE_AND_NULL(d_plan->subprobstartpts, stream); - - CUDA_FREE_AND_NULL(d_plan->numnupts, stream); - CUDA_FREE_AND_NULL(d_plan->numsubprob, stream); + utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); + auto &stream = d_plan->stream; + + CUDA_FREE_AND_NULL(d_plan->fw, stream); + CUDA_FREE_AND_NULL(d_plan->fwkerhalf1, stream); + CUDA_FREE_AND_NULL(d_plan->fwkerhalf2, stream); + CUDA_FREE_AND_NULL(d_plan->fwkerhalf3, stream); + + CUDA_FREE_AND_NULL(d_plan->idxnupts, stream); + CUDA_FREE_AND_NULL(d_plan->sortidx, stream); + CUDA_FREE_AND_NULL(d_plan->numsubprob, stream); + CUDA_FREE_AND_NULL(d_plan->binsize, stream); + CUDA_FREE_AND_NULL(d_plan->binstartpts, stream); + CUDA_FREE_AND_NULL(d_plan->subprob_to_bin, stream); + CUDA_FREE_AND_NULL(d_plan->subprobstartpts, stream); + + CUDA_FREE_AND_NULL(d_plan->numnupts, stream); + CUDA_FREE_AND_NULL(d_plan->numsubprob, stream); } template int allocgpumem1d_plan(cufinufft_plan_t *d_plan); diff --git a/src/cuda/precision_independent.cu b/src/cuda/precision_independent.cu index 1ab2865e0..66cc5ca69 100644 --- a/src/cuda/precision_independent.cu +++ b/src/cuda/precision_independent.cu @@ -18,216 +18,237 @@ __device__ RT carg(const CT &z) { return (RT)atan2(ipart(z), rpart(z)); } // pol __device__ RT cabs(const CT &z) { return (RT)cuCabs(z); } /* Common Kernels from spreadinterp3d */ -__host__ __device__ int calc_global_index(int xidx, int yidx, int zidx, int onx, int ony, int onz, int bnx, int bny, - int bnz) { - int oix, oiy, oiz; - oix = xidx / bnx; - oiy = yidx / bny; - oiz = zidx / bnz; - return (oix + oiy * onx + oiz * ony * onx) * (bnx * bny * bnz) + - (xidx % bnx + yidx % bny * bnx + zidx % bnz * bny * bnx); +__host__ __device__ int calc_global_index(int xidx, int yidx, int zidx, int onx, int ony, + int onz, int bnx, int bny, int bnz) { + int oix, oiy, oiz; + oix = xidx / bnx; + oiy = yidx / bny; + oiz = zidx / bnz; + return (oix + oiy * onx + oiz * ony * onx) * (bnx * bny * bnz) + + (xidx % bnx + yidx % bny * bnx + zidx % bnz * bny * bnx); } -__device__ int calc_global_index_v2(int xidx, int yidx, int zidx, int nbinx, int nbiny, int nbinz) { - return xidx + yidx * nbinx + zidx * nbinx * nbiny; +__device__ int calc_global_index_v2(int xidx, int yidx, int zidx, int nbinx, int nbiny, + int nbinz) { + return xidx + yidx * nbinx + zidx * nbinx * nbiny; } /* spreadinterp 1d */ -__global__ void calc_subprob_1d(int *bin_size, int *num_subprob, int maxsubprobsize, int numbins) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; i += gridDim.x * blockDim.x) { - num_subprob[i] = ceil(bin_size[i] / (float)maxsubprobsize); - } +__global__ void calc_subprob_1d(int *bin_size, int *num_subprob, int maxsubprobsize, + int numbins) { + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; + i += gridDim.x * blockDim.x) { + num_subprob[i] = 
ceil(bin_size[i] / (float)maxsubprobsize); + } } -__global__ void map_b_into_subprob_1d(int *d_subprob_to_bin, int *d_subprobstartpts, int *d_numsubprob, int numbins) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; i += gridDim.x * blockDim.x) { - for (int j = 0; j < d_numsubprob[i]; j++) { - d_subprob_to_bin[d_subprobstartpts[i] + j] = i; - } +__global__ void map_b_into_subprob_1d(int *d_subprob_to_bin, int *d_subprobstartpts, + int *d_numsubprob, int numbins) { + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; + i += gridDim.x * blockDim.x) { + for (int j = 0; j < d_numsubprob[i]; j++) { + d_subprob_to_bin[d_subprobstartpts[i] + j] = i; } + } } __global__ void trivial_global_sort_index_1d(int M, int *index) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { - index[i] = i; - } + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; + i += gridDim.x * blockDim.x) { + index[i] = i; + } } /* spreadinterp 2d */ -__global__ void calc_subprob_2d(int *bin_size, int *num_subprob, int maxsubprobsize, int numbins) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; i += gridDim.x * blockDim.x) { - num_subprob[i] = ceil(bin_size[i] / (float)maxsubprobsize); - } +__global__ void calc_subprob_2d(int *bin_size, int *num_subprob, int maxsubprobsize, + int numbins) { + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; + i += gridDim.x * blockDim.x) { + num_subprob[i] = ceil(bin_size[i] / (float)maxsubprobsize); + } } -__global__ void map_b_into_subprob_2d(int *d_subprob_to_bin, int *d_subprobstartpts, int *d_numsubprob, int numbins) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; i += gridDim.x * blockDim.x) { - for (int j = 0; j < d_numsubprob[i]; j++) { - d_subprob_to_bin[d_subprobstartpts[i] + j] = i; - } +__global__ void map_b_into_subprob_2d(int *d_subprob_to_bin, int *d_subprobstartpts, + int *d_numsubprob, int numbins) { + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; + i += gridDim.x * blockDim.x) { + for (int j = 0; j < d_numsubprob[i]; j++) { + d_subprob_to_bin[d_subprobstartpts[i] + j] = i; } + } } __global__ void trivial_global_sort_index_2d(int M, int *index) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { - index[i] = i; - } + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; + i += gridDim.x * blockDim.x) { + index[i] = i; + } } /* spreadinterp3d */ -__global__ void calc_subprob_3d_v2(int *bin_size, int *num_subprob, int maxsubprobsize, int numbins) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; i += gridDim.x * blockDim.x) { - num_subprob[i] = ceil(bin_size[i] / (float)maxsubprobsize); - } +__global__ void calc_subprob_3d_v2(int *bin_size, int *num_subprob, int maxsubprobsize, + int numbins) { + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; + i += gridDim.x * blockDim.x) { + num_subprob[i] = ceil(bin_size[i] / (float)maxsubprobsize); + } } -__global__ void map_b_into_subprob_3d_v2(int *d_subprob_to_bin, int *d_subprobstartpts, int *d_numsubprob, - int numbins) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; i += gridDim.x * blockDim.x) { - for (int j = 0; j < d_numsubprob[i]; j++) { - d_subprob_to_bin[d_subprobstartpts[i] + j] = i; - } +__global__ void map_b_into_subprob_3d_v2(int *d_subprob_to_bin, int *d_subprobstartpts, + int *d_numsubprob, int numbins) { + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < 
numbins; + i += gridDim.x * blockDim.x) { + for (int j = 0; j < d_numsubprob[i]; j++) { + d_subprob_to_bin[d_subprobstartpts[i] + j] = i; } + } } -__global__ void calc_subprob_3d_v1(int binsperobinx, int binsperobiny, int binsperobinz, int *bin_size, - int *num_subprob, int maxsubprobsize, int numbins) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; i += gridDim.x * blockDim.x) { - int numnupts = 0; - int binsperobin = binsperobinx * binsperobiny * binsperobinz; - for (int b = 0; b < binsperobin; b++) { - numnupts += bin_size[binsperobin * i + b]; - } - num_subprob[i] = ceil(numnupts / (float)maxsubprobsize); +__global__ void calc_subprob_3d_v1(int binsperobinx, int binsperobiny, int binsperobinz, + int *bin_size, int *num_subprob, int maxsubprobsize, + int numbins) { + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; + i += gridDim.x * blockDim.x) { + int numnupts = 0; + int binsperobin = binsperobinx * binsperobiny * binsperobinz; + for (int b = 0; b < binsperobin; b++) { + numnupts += bin_size[binsperobin * i + b]; } + num_subprob[i] = ceil(numnupts / (float)maxsubprobsize); + } } -__global__ void map_b_into_subprob_3d_v1(int *d_subprob_to_obin, int *d_subprobstartpts, int *d_numsubprob, - int numbins) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; i += gridDim.x * blockDim.x) { - for (int j = 0; j < d_numsubprob[i]; j++) { - d_subprob_to_obin[d_subprobstartpts[i] + j] = i; - } +__global__ void map_b_into_subprob_3d_v1(int *d_subprob_to_obin, int *d_subprobstartpts, + int *d_numsubprob, int numbins) { + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < numbins; + i += gridDim.x * blockDim.x) { + for (int j = 0; j < d_numsubprob[i]; j++) { + d_subprob_to_obin[d_subprobstartpts[i] + j] = i; } + } } __global__ void trivial_global_sort_index_3d(int M, int *index) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { - index[i] = i; - } + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; + i += gridDim.x * blockDim.x) { + index[i] = i; + } } -__global__ void fill_ghost_bins(int binsperobinx, int binsperobiny, int binsperobinz, int nobinx, int nobiny, - int nobinz, int *binsize) { - int binx = threadIdx.x + blockIdx.x * blockDim.x; - int biny = threadIdx.y + blockIdx.y * blockDim.y; - int binz = threadIdx.z + blockIdx.z * blockDim.z; - - int nbinx = nobinx * binsperobinx; - int nbiny = nobiny * binsperobiny; - int nbinz = nobinz * binsperobinz; - - if (binx < nbinx && biny < nbiny && binz < nbinz) { - int binidx = - calc_global_index(binx, biny, binz, nobinx, nobiny, nobinz, binsperobinx, binsperobiny, binsperobinz); - int i, j, k; - i = binx; - j = biny; - k = binz; - if (binx % binsperobinx == 0) { - i = binx - 2; - i = i < 0 ? i + nbinx : i; - } - if (binx % binsperobinx == binsperobinx - 1) { - i = binx + 2; - i = (i >= nbinx) ? i - nbinx : i; - } - if (biny % binsperobiny == 0) { - j = biny - 2; - j = j < 0 ? j + nbiny : j; - } - if (biny % binsperobiny == binsperobiny - 1) { - j = biny + 2; - j = (j >= nbiny) ? j - nbiny : j; - } - if (binz % binsperobinz == 0) { - k = binz - 2; - k = k < 0 ? k + nbinz : k; - } - if (binz % binsperobinz == binsperobinz - 1) { - k = binz + 2; - k = (k >= nbinz) ? 
k - nbinz : k; - } - int idxtoupdate = calc_global_index(i, j, k, nobinx, nobiny, nobinz, binsperobinx, binsperobiny, binsperobinz); - if (idxtoupdate != binidx) { - binsize[binidx] = binsize[idxtoupdate]; - } - } -} +__global__ void fill_ghost_bins(int binsperobinx, int binsperobiny, int binsperobinz, + int nobinx, int nobiny, int nobinz, int *binsize) { + int binx = threadIdx.x + blockIdx.x * blockDim.x; + int biny = threadIdx.y + blockIdx.y * blockDim.y; + int binz = threadIdx.z + blockIdx.z * blockDim.z; -__global__ void ghost_bin_pts_index(int binsperobinx, int binsperobiny, int binsperobinz, int nobinx, int nobiny, - int nobinz, int *binsize, int *index, int *binstartpts, int M) { - int binx = threadIdx.x + blockIdx.x * blockDim.x; - int biny = threadIdx.y + blockIdx.y * blockDim.y; - int binz = threadIdx.z + blockIdx.z * blockDim.z; - int nbinx = nobinx * binsperobinx; - int nbiny = nobiny * binsperobiny; - int nbinz = nobinz * binsperobinz; + int nbinx = nobinx * binsperobinx; + int nbiny = nobiny * binsperobiny; + int nbinz = nobinz * binsperobinz; + if (binx < nbinx && biny < nbiny && binz < nbinz) { + int binidx = calc_global_index(binx, biny, binz, nobinx, nobiny, nobinz, binsperobinx, + binsperobiny, binsperobinz); int i, j, k; - int w = 0; - int box[3]; - if (binx < nbinx && biny < nbiny && binz < nbinz) { - box[0] = box[1] = box[2] = 0; - i = binx; - j = biny; - k = binz; - int binidx = - calc_global_index(binx, biny, binz, nobinx, nobiny, nobinz, binsperobinx, binsperobiny, binsperobinz); - if (binx % binsperobinx == 0) { - i = binx - 2; - box[0] = (i < 0); - i = i < 0 ? i + nbinx : i; - w = 1; - } - if (binx % binsperobinx == binsperobinx - 1) { - i = binx + 2; - box[0] = (i > nbinx) * 2; - i = (i > nbinx) ? i - nbinx : i; - w = 1; - } - if (biny % binsperobiny == 0) { - j = biny - 2; - box[1] = (j < 0); - j = j < 0 ? j + nbiny : j; - w = 1; - } - if (biny % binsperobiny == binsperobiny - 1) { - j = biny + 2; - box[1] = (j > nbiny) * 2; - j = (j > nbiny) ? j - nbiny : j; - w = 1; - } - if (binz % binsperobinz == 0) { - k = binz - 2; - box[2] = (k < 0); - k = k < 0 ? k + nbinz : k; - w = 1; - } - if (binz % binsperobinz == binsperobinz - 1) { - k = binz + 2; - box[2] = (k > nbinz) * 2; - k = (k > nbinz) ? k - nbinz : k; - w = 1; - } - int corbinidx = calc_global_index(i, j, k, nobinx, nobiny, nobinz, binsperobinx, binsperobiny, binsperobinz); - if (w == 1) { - for (int n = 0; n < binsize[binidx]; n++) { - index[binstartpts[binidx] + n] = - M * (box[0] + box[1] * 3 + box[2] * 9) + index[binstartpts[corbinidx] + n]; - } - } + i = binx; + j = biny; + k = binz; + if (binx % binsperobinx == 0) { + i = binx - 2; + i = i < 0 ? i + nbinx : i; + } + if (binx % binsperobinx == binsperobinx - 1) { + i = binx + 2; + i = (i >= nbinx) ? i - nbinx : i; + } + if (biny % binsperobiny == 0) { + j = biny - 2; + j = j < 0 ? j + nbiny : j; + } + if (biny % binsperobiny == binsperobiny - 1) { + j = biny + 2; + j = (j >= nbiny) ? j - nbiny : j; + } + if (binz % binsperobinz == 0) { + k = binz - 2; + k = k < 0 ? k + nbinz : k; + } + if (binz % binsperobinz == binsperobinz - 1) { + k = binz + 2; + k = (k >= nbinz) ? 
k - nbinz : k; + } + int idxtoupdate = calc_global_index(i, j, k, nobinx, nobiny, nobinz, binsperobinx, + binsperobiny, binsperobinz); + if (idxtoupdate != binidx) { + binsize[binidx] = binsize[idxtoupdate]; + } + } +} + +__global__ void ghost_bin_pts_index(int binsperobinx, int binsperobiny, int binsperobinz, + int nobinx, int nobiny, int nobinz, int *binsize, + int *index, int *binstartpts, int M) { + int binx = threadIdx.x + blockIdx.x * blockDim.x; + int biny = threadIdx.y + blockIdx.y * blockDim.y; + int binz = threadIdx.z + blockIdx.z * blockDim.z; + int nbinx = nobinx * binsperobinx; + int nbiny = nobiny * binsperobiny; + int nbinz = nobinz * binsperobinz; + + int i, j, k; + int w = 0; + int box[3]; + if (binx < nbinx && biny < nbiny && binz < nbinz) { + box[0] = box[1] = box[2] = 0; + i = binx; + j = biny; + k = binz; + int binidx = calc_global_index(binx, biny, binz, nobinx, nobiny, nobinz, binsperobinx, + binsperobiny, binsperobinz); + if (binx % binsperobinx == 0) { + i = binx - 2; + box[0] = (i < 0); + i = i < 0 ? i + nbinx : i; + w = 1; + } + if (binx % binsperobinx == binsperobinx - 1) { + i = binx + 2; + box[0] = (i > nbinx) * 2; + i = (i > nbinx) ? i - nbinx : i; + w = 1; + } + if (biny % binsperobiny == 0) { + j = biny - 2; + box[1] = (j < 0); + j = j < 0 ? j + nbiny : j; + w = 1; + } + if (biny % binsperobiny == binsperobiny - 1) { + j = biny + 2; + box[1] = (j > nbiny) * 2; + j = (j > nbiny) ? j - nbiny : j; + w = 1; + } + if (binz % binsperobinz == 0) { + k = binz - 2; + box[2] = (k < 0); + k = k < 0 ? k + nbinz : k; + w = 1; + } + if (binz % binsperobinz == binsperobinz - 1) { + k = binz + 2; + box[2] = (k > nbinz) * 2; + k = (k > nbinz) ? k - nbinz : k; + w = 1; + } + int corbinidx = calc_global_index(i, j, k, nobinx, nobiny, nobinz, binsperobinx, + binsperobiny, binsperobinz); + if (w == 1) { + for (int n = 0; n < binsize[binidx]; n++) { + index[binstartpts[binidx] + n] = + M * (box[0] + box[1] * 3 + box[2] * 9) + index[binstartpts[corbinidx] + n]; + } } + } } } // namespace common diff --git a/src/cuda/spreadinterp.cpp b/src/cuda/spreadinterp.cpp index f129f73b7..6ff91f8ca 100644 --- a/src/cuda/spreadinterp.cpp +++ b/src/cuda/spreadinterp.cpp @@ -13,7 +13,7 @@ namespace cufinufft { namespace spreadinterp { -template +template int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmeth) // Initializes spreader kernel parameters given desired NUFFT tolerance eps, // upsampling factor (=sigma in paper, or R in Dutt-Rokhlin), and ker eval meth @@ -22,70 +22,76 @@ int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmet // Must call before any kernel evals done. // Returns: 0 success, 1, warning, >1 failure (see error codes in utils.h) { - if (upsampfac != 2.0) { // nonstandard sigma - if (kerevalmeth == 1) { - fprintf(stderr, "[%s] nonstandard upsampfac=%.3g cannot be handled by kerevalmeth=1\n", __func__, - upsampfac); - return FINUFFT_ERR_HORNER_WRONG_BETA; - } - if (upsampfac <= 1.0) { - fprintf(stderr, "[%s] error: upsampfac=%.3g is <=1.0\n", __func__, upsampfac); - return FINUFFT_ERR_UPSAMPFAC_TOO_SMALL; - } - // calling routine must abort on above errors, since opts is garbage! 
- if (upsampfac > 4.0) - fprintf(stderr, "[%s] warning: upsampfac=%.3g is too large to be beneficial!\n", __func__, upsampfac); + if (upsampfac != 2.0) { // nonstandard sigma + if (kerevalmeth == 1) { + fprintf(stderr, + "[%s] nonstandard upsampfac=%.3g cannot be handled by kerevalmeth=1\n", + __func__, upsampfac); + return FINUFFT_ERR_HORNER_WRONG_BETA; } + if (upsampfac <= 1.0) { + fprintf(stderr, "[%s] error: upsampfac=%.3g is <=1.0\n", __func__, upsampfac); + return FINUFFT_ERR_UPSAMPFAC_TOO_SMALL; + } + // calling routine must abort on above errors, since opts is garbage! + if (upsampfac > 4.0) + fprintf(stderr, "[%s] warning: upsampfac=%.3g is too large to be beneficial!\n", + __func__, upsampfac); + } - // defaults... (user can change after this function called) - opts.spread_direction = 1; // user should always set to 1 or 2 as desired - opts.upsampfac = upsampfac; + // defaults... (user can change after this function called) + opts.spread_direction = 1; // user should always set to 1 or 2 as desired + opts.upsampfac = upsampfac; - // as in FINUFFT v2.0, allow too-small-eps by truncating to eps_mach... - int ier = 0; + // as in FINUFFT v2.0, allow too-small-eps by truncating to eps_mach... + int ier = 0; - constexpr T EPSILON = std::numeric_limits::epsilon(); - if (eps < EPSILON) { - fprintf(stderr, "setup_spreader: warning, increasing tol=%.3g to eps_mach=%.3g.\n", (double)eps, - (double)EPSILON); - eps = EPSILON; - ier = FINUFFT_WARN_EPS_TOO_SMALL; - } + constexpr T EPSILON = std::numeric_limits::epsilon(); + if (eps < EPSILON) { + fprintf(stderr, "setup_spreader: warning, increasing tol=%.3g to eps_mach=%.3g.\n", + (double)eps, (double)EPSILON); + eps = EPSILON; + ier = FINUFFT_WARN_EPS_TOO_SMALL; + } - // Set kernel width w (aka ns) and ES kernel beta parameter, in opts... - int ns = std::ceil(-log10(eps / (T)10.0)); // 1 digit per power of ten - if (upsampfac != 2.0) // override ns for custom sigma - ns = std::ceil(-log(eps) / (T(M_PI) * sqrt(1 - 1 / upsampfac))); // formula, gamma=1 - ns = std::max(2, ns); // we don't have ns=1 version yet - if (ns > MAX_NSPREAD) { // clip to match allocated arrays - fprintf(stderr, "%s warning: at upsampfac=%.3g, tol=%.3g would need kernel width ns=%d; clipping to max %d.\n", - __func__, upsampfac, (double)eps, ns, MAX_NSPREAD); - ns = MAX_NSPREAD; - ier = FINUFFT_WARN_EPS_TOO_SMALL; - } - opts.nspread = ns; - opts.ES_halfwidth = (T)ns / 2; // constants to help ker eval (except Horner) - opts.ES_c = 4.0 / (T)(ns * ns); + // Set kernel width w (aka ns) and ES kernel beta parameter, in opts... + int ns = std::ceil(-log10(eps / (T)10.0)); // 1 digit per power of ten + if (upsampfac != 2.0) // override ns for custom sigma + ns = std::ceil(-log(eps) / (T(M_PI) * sqrt(1 - 1 / upsampfac))); // formula, + // gamma=1 + ns = std::max(2, ns); // we don't have ns=1 version yet + if (ns > MAX_NSPREAD) { // clip to match allocated arrays + fprintf(stderr, + "%s warning: at upsampfac=%.3g, tol=%.3g would need kernel width ns=%d; " + "clipping to max %d.\n", + __func__, upsampfac, (double)eps, ns, MAX_NSPREAD); + ns = MAX_NSPREAD; + ier = FINUFFT_WARN_EPS_TOO_SMALL; + } + opts.nspread = ns; + opts.ES_halfwidth = (T)ns / 2; // constants to help ker eval (except Horner) + opts.ES_c = 4.0 / (T)(ns * ns); - T betaoverns = 2.30; // gives decent betas for default sigma=2.0 - if (ns == 2) - betaoverns = 2.20; // some small-width tweaks... 
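In symbols (writing σ for upsampfac, ε for the tolerance eps, and w for the kernel width nspread set here), the parameter selection in setup_spreader amounts to the following restatement of the formulas in the code, with no behavioural change:

\[
  w \;=\;
  \begin{cases}
    \bigl\lceil \log_{10}(10/\varepsilon) \bigr\rceil, & \sigma = 2 \quad (\text{one digit per power of ten}),\\[2pt]
    \bigl\lceil -\ln\varepsilon \,/\, \bigl(\pi\sqrt{1-1/\sigma}\bigr) \bigr\rceil, & \text{otherwise},
  \end{cases}
  \qquad 2 \le w \le \mathrm{MAX\_NSPREAD},
\]
\[
  \mathrm{ES\_halfwidth} = w/2, \qquad \mathrm{ES\_c} = 4/w^2, \qquad
  \beta = c_\beta\, w, \quad
  c_\beta \approx 2.30 \;(\sigma = 2;\ 2.20,\,2.26,\,2.38 \text{ for } w = 2,3,4), \quad
  c_\beta = 0.97\,\pi\bigl(1 - \tfrac{1}{2\sigma}\bigr) \;(\sigma \neq 2).
\]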
- if (ns == 3) - betaoverns = 2.26; - if (ns == 4) - betaoverns = 2.38; - if (upsampfac != 2.0) { // again, override beta for custom sigma - T gamma = 0.97; // must match devel/gen_all_horner_C_code.m - betaoverns = gamma * T(M_PI) * (1 - 1 / (2 * upsampfac)); // formula based on cutoff - } - opts.ES_beta = betaoverns * (T)ns; // set the kernel beta parameter - // fprintf(stderr,"setup_spreader: sigma=%.6f, chose ns=%d beta=%.6f\n",(double)upsampfac,ns,(double)opts.ES_beta); - // // user hasn't set debug yet - return ier; + T betaoverns = 2.30; // gives decent betas for default sigma=2.0 + if (ns == 2) betaoverns = 2.20; // some small-width tweaks... + if (ns == 3) betaoverns = 2.26; + if (ns == 4) betaoverns = 2.38; + if (upsampfac != 2.0) { // again, override beta for custom sigma + T gamma = 0.97; // must match devel/gen_all_horner_C_code.m + betaoverns = gamma * T(M_PI) * (1 - 1 / (2 * upsampfac)); // formula based on + // cutoff + } + opts.ES_beta = betaoverns * (T)ns; // set the kernel beta parameter + // fprintf(stderr,"setup_spreader: sigma=%.6f, chose ns=%d + // beta=%.6f\n",(double)upsampfac,ns,(double)opts.ES_beta); + // // user hasn't set debug yet + return ier; } -template int setup_spreader(finufft_spread_opts &opts, float eps, float upsampfac, int kerevalmeth); -template int setup_spreader(finufft_spread_opts &opts, double eps, double upsampfac, int kerevalmeth); +template int setup_spreader(finufft_spread_opts &opts, float eps, float upsampfac, + int kerevalmeth); +template int setup_spreader(finufft_spread_opts &opts, double eps, double upsampfac, + int kerevalmeth); template float evaluate_kernel(float x, const finufft_spread_opts &opts); template double evaluate_kernel(double x, const finufft_spread_opts &opts); diff --git a/src/cuda/utils.cpp b/src/cuda/utils.cpp index 1c10f3453..9c3003cb8 100644 --- a/src/cuda/utils.cpp +++ b/src/cuda/utils.cpp @@ -9,23 +9,18 @@ CUFINUFFT_BIGINT next235beven(CUFINUFFT_BIGINT n, CUFINUFFT_BIGINT b) // changed INT64 type 3/28/17. Runtime is around n*1e-11 sec for big n. // added condition about b Melody 05/31/20 { - if (n <= 2) - return 2; - if (n % 2 == 1) - n += 1; // even - CUFINUFFT_BIGINT nplus = n - 2; // to cancel out the +=2 at start of loop - CUFINUFFT_BIGINT numdiv = 2; // a dummy that is >1 - while ((numdiv > 1) || (nplus % b != 0)) { - nplus += 2; // stays even - numdiv = nplus; - while (numdiv % 2 == 0) - numdiv /= 2; // remove all factors of 2,3,5... - while (numdiv % 3 == 0) - numdiv /= 3; - while (numdiv % 5 == 0) - numdiv /= 5; - } - return nplus; + if (n <= 2) return 2; + if (n % 2 == 1) n += 1; // even + CUFINUFFT_BIGINT nplus = n - 2; // to cancel out the +=2 at start of loop + CUFINUFFT_BIGINT numdiv = 2; // a dummy that is >1 + while ((numdiv > 1) || (nplus % b != 0)) { + nplus += 2; // stays even + numdiv = nplus; + while (numdiv % 2 == 0) numdiv /= 2; // remove all factors of 2,3,5... + while (numdiv % 3 == 0) numdiv /= 3; + while (numdiv % 5 == 0) numdiv /= 5; + } + return nplus; } // ----------------------- helpers for timing (always stay double prec)... @@ -35,19 +30,19 @@ void CNTime::start() { gettimeofday(&initial, 0); } double CNTime::restart() // Barnett changed to returning in sec { - double delta = this->elapsedsec(); - this->start(); - return delta; + double delta = this->elapsedsec(); + this->start(); + return delta; } double CNTime::elapsedsec() // returns answers as double, in seconds, to microsec accuracy. 
Barnett 5/22/18 { - struct timeval now; - gettimeofday(&now, 0); - double nowsec = (double)now.tv_sec + 1e-6 * now.tv_usec; - double initialsec = (double)initial.tv_sec + 1e-6 * initial.tv_usec; - return nowsec - initialsec; + struct timeval now; + gettimeofday(&now, 0); + double nowsec = (double)now.tv_sec + 1e-6 * now.tv_usec; + double initialsec = (double)initial.tv_sec + 1e-6 * initial.tv_usec; + return nowsec - initialsec; } } // namespace utils diff --git a/src/finufft.cpp b/src/finufft.cpp index 5b33ef126..03c1d9ac6 100644 --- a/src/finufft.cpp +++ b/src/finufft.cpp @@ -4,19 +4,19 @@ // private headers for lib build // (must come after finufft.h which clobbers FINUFFT* macros) #include +#include +#include #include #include -#include -#include -#include +#include "../contrib/legendre_rule_fast.h" #include +#include #include #include #include #include #include -#include "../contrib/legendre_rule_fast.h" using namespace std; using namespace finufft; @@ -24,7 +24,6 @@ using namespace finufft::utils; using namespace finufft::spreadinterp; using namespace finufft::quadrature; - /* Computational core for FINUFFT. Based on Barnett 2017-2018 finufft?d.cpp containing nine drivers, plus @@ -39,41 +38,41 @@ using namespace finufft::quadrature; Algorithm summaries taken from old finufft?d?() documentation, Feb-Jun 2017: TYPE 1: - The type 1 NUFFT proceeds in three main steps: - 1) spread data to oversampled regular mesh using kernel. - 2) compute FFT on uniform mesh - 3) deconvolve by division of each Fourier mode independently by the kernel - Fourier series coeffs (not merely FFT of kernel), shuffle to output. - The kernel coeffs are precomputed in what is called step 0 in the code. + The type 1 NUFFT proceeds in three main steps: + 1) spread data to oversampled regular mesh using kernel. + 2) compute FFT on uniform mesh + 3) deconvolve by division of each Fourier mode independently by the kernel + Fourier series coeffs (not merely FFT of kernel), shuffle to output. + The kernel coeffs are precomputed in what is called step 0 in the code. Written with FFTW style complex arrays. Step 3a internally uses CPX, and Step 3b internally uses real arithmetic and FFTW style complex. TYPE 2: - The type 2 algorithm proceeds in three main steps: - 1) deconvolve (amplify) each Fourier mode, dividing by kernel Fourier coeff - 2) compute inverse FFT on uniform fine grid - 3) spread (dir=2, ie interpolate) data to regular mesh - The kernel coeffs are precomputed in what is called step 0 in the code. + The type 2 algorithm proceeds in three main steps: + 1) deconvolve (amplify) each Fourier mode, dividing by kernel Fourier coeff + 2) compute inverse FFT on uniform fine grid + 3) spread (dir=2, ie interpolate) data to regular mesh + The kernel coeffs are precomputed in what is called step 0 in the code. Written with FFTW style complex arrays. Step 0 internally uses CPX, and Step 1 internally uses real arithmetic and FFTW style complex. TYPE 3: - The type 3 algorithm is basically a type 2 (which is implemented precisely - as call to type 2) replacing the middle FFT (Step 2) of a type 1. - Beyond this, the new twists are: - i) nf1, number of upsampled points for the type-1, depends on the product - of interval widths containing input and output points (X*S). 
- ii) The deconvolve (post-amplify) step is division by the Fourier transform - of the scaled kernel, evaluated on the *nonuniform* output frequency - grid; this is done by direct approximation of the Fourier integral - using quadrature of the kernel function times exponentials. - iii) Shifts in x (real) and s (Fourier) are done to minimize the interval - half-widths X and S, hence nf1. + The type 3 algorithm is basically a type 2 (which is implemented precisely + as call to type 2) replacing the middle FFT (Step 2) of a type 1. + Beyond this, the new twists are: + i) nf1, number of upsampled points for the type-1, depends on the product + of interval widths containing input and output points (X*S). + ii) The deconvolve (post-amplify) step is division by the Fourier transform + of the scaled kernel, evaluated on the *nonuniform* output frequency + grid; this is done by direct approximation of the Fourier integral + using quadrature of the kernel function times exponentials. + iii) Shifts in x (real) and s (Fourier) are done to minimize the interval + half-widths X and S, hence nf1. No references to FFTW are needed here. CPX arithmetic is used. MULTIPLE STRENGTH VECTORS FOR THE SAME NONUNIFORM POINTS (n_transf>1): - maxBatchSize (set to max_num_omp_threads) times the RAM is needed, so - this is good only for small problems. + maxBatchSize (set to max_num_omp_threads) times the RAM is needed, so + this is good only for small problems. Design notes for guru interface implementation: @@ -86,18 +85,16 @@ Design notes for guru interface implementation: state apart from that associated with FFTW (and the did_fftw_init). */ - - // ---------- local math routines (were in common.cpp; no need now): -------- namespace finufft { - namespace common { +namespace common { - // Technically global state... - // Needs to be static to avoid name collision with SINGLE/DOUBLE - static std::mutex fftw_lock; +// Technically global state... +// Needs to be static to avoid name collision with SINGLE/DOUBLE +static std::mutex fftw_lock; - // We macro because it has no FLT args but gets compiled for both prec's... +// We macro because it has no FLT args but gets compiled for both prec's... #ifdef SINGLE #define SET_NF_TYPE12 set_nf_type12f #else @@ -108,18 +105,22 @@ int SET_NF_TYPE12(BIGINT ms, finufft_opts opts, finufft_spread_opts spopts, BIGI // and requested number of Fourier modes ms. Returns 0 if success, else an // error code if nf was unreasonably big (& tell the world). { - *nf = (BIGINT)(opts.upsampfac*ms); // manner of rounding not crucial - if (*nf<2*spopts.nspread) *nf=2*spopts.nspread; // otherwise spread fails - if (*nf=0) // overrides + spopts.debug = opts.spread_debug; + spopts.sort = opts.spread_sort; // could make dim or CPU choices here? 
+ spopts.kerpad = opts.spread_kerpad; // (only applies to kerevalmeth=0) + spopts.chkbnds = opts.chkbnds; + spopts.nthreads = opts.nthreads; // 0 passed in becomes omp max by here + if (opts.spread_nthr_atomic >= 0) // overrides spopts.atomic_threshold = opts.spread_nthr_atomic; - if (opts.spread_max_sp_size>0) // overrides + if (opts.spread_max_sp_size > 0) // overrides spopts.max_subproblem_size = opts.spread_max_sp_size; - if (opts.chkbnds != 1) // deprecated default value hardcoded here - fprintf(stderr, "[%s] opts.chkbnds is deprecated; ignoring change from default value.\n",__func__); + if (opts.chkbnds != 1) // deprecated default value hardcoded here + fprintf(stderr, + "[%s] opts.chkbnds is deprecated; ignoring change from default value.\n", + __func__); return ier; -} +} void set_nhg_type3(FLT S, FLT X, finufft_opts opts, finufft_spread_opts spopts, - BIGINT *nf, FLT *h, FLT *gam) + BIGINT *nf, FLT *h, FLT *gam) /* sets nf, h (upsampled grid spacing), and gamma (x_j rescaling factor), for type 3 only. Inputs: @@ -156,26 +159,27 @@ void set_nhg_type3(FLT S, FLT X, finufft_opts opts, finufft_spread_opts spopts, New logic 6/12/17 */ { - int nss = spopts.nspread + 1; // since ns may be odd - FLT Xsafe=X, Ssafe=S; // may be tweaked locally - if (X==0.0) // logic ensures XS>=1, handle X=0 a/o S=0 - if (S==0.0) { - Xsafe=1.0; - Ssafe=1.0; - } else Xsafe = max(Xsafe, 1/S); + int nss = spopts.nspread + 1; // since ns may be odd + FLT Xsafe = X, Ssafe = S; // may be tweaked locally + if (X == 0.0) // logic ensures XS>=1, handle X=0 a/o S=0 + if (S == 0.0) { + Xsafe = 1.0; + Ssafe = 1.0; + } else + Xsafe = max(Xsafe, 1 / S); else - Ssafe = max(Ssafe, 1/X); + Ssafe = max(Ssafe, 1 / X); // use the safe X and S... - FLT nfd = 2.0*opts.upsampfac*Ssafe*Xsafe/PI + nss; - if (!isfinite(nfd)) nfd=0.0; // use FLT to catch inf + FLT nfd = 2.0 * opts.upsampfac * Ssafe * Xsafe / PI + nss; + if (!isfinite(nfd)) nfd = 0.0; // use FLT to catch inf *nf = (BIGINT)nfd; - //printf("initial nf=%lld, ns=%d\n",*nf,spopts.nspread); - // catch too small nf, and nan or +-inf, otherwise spread fails... 
- if (*nf<2*spopts.nspread) *nf=2*spopts.nspread; - if (*nf a[MAX_NQUAD]; - for (int n=0;n brk(nt+1); // start indices for each thread - for (int t=0; t<=nt; ++t) // split nout mode indices btw threads - brk[t] = (BIGINT)(0.5 + nout*t/(double)nt); + BIGINT nout = nf / 2 + 1; // how many values we're writing to + int nt = min(nout, (BIGINT)opts.nthreads); // how many chunks + std::vector brk(nt + 1); // start indices for each thread + for (int t = 0; t <= nt; ++t) // split nout mode indices btw threads + brk[t] = (BIGINT)(0.5 + nout * t / (double)nt); #pragma omp parallel num_threads(nt) - { // each thread gets own chunk to do + { // each thread gets own chunk to do int t = MY_OMP_GET_THREAD_NUM(); - std::complex aj[MAX_NQUAD]; // phase rotator for this thread - for (int n=0;n aj[MAX_NQUAD]; // phase rotator for this thread + for (int n = 0; n < q; ++n) + aj[n] = pow(a[n], (FLT)brk[t]); // init phase factors for chunk + for (BIGINT j = brk[t]; j < brk[t + 1]; ++j) { // loop along output array + FLT x = 0.0; // accumulator for answer at this j + for (int n = 0; n < q; ++n) { + x += f[n] * 2 * real(aj[n]); // include the negative freq + aj[n] *= a[n]; // wind the phases } fwkerhalf[j] = x; } @@ -249,8 +253,8 @@ void onedim_nuft_kernel(BIGINT nk, FLT *k, FLT *phihat, finufft_spread_opts opts Inputs: nk - number of freqs k - frequencies, dual to the kernel's natural argument, ie exp(i.k.z) - Note, z is in grid-point units, and k values must be in [-pi, pi) for - accuracy. + Note, z is in grid-point units, and k values must be in [-pi, pi) for + accuracy. opts - spreading opts object, needed to eval kernel (must be already set up) Outputs: @@ -259,38 +263,39 @@ void onedim_nuft_kernel(BIGINT nk, FLT *k, FLT *phihat, finufft_spread_opts opts Barnett 2/8/17. openmp since cos slow 2/9/17 */ { - FLT J2 = opts.nspread/2.0; // J/2, half-width of ker z-support + FLT J2 = opts.nspread / 2.0; // J/2, half-width of ker z-support // # quadr nodes in z (from 0 to J/2; reflections will be added)... - int q=(int)(2 + 2.0*J2); // > pi/2 ratio. cannot exceed MAX_NQUAD - if (opts.debug) printf("q (# ker FT quadr pts) = %d\n",q); - FLT f[MAX_NQUAD]; double z[2*MAX_NQUAD],w[2*MAX_NQUAD]; // glr needs double - legendre_compute_glr(2*q,z,w); // only half the nodes used, eg on (0,1) - for (int n=0;n pi/2 ratio. cannot exceed MAX_NQUAD + if (opts.debug) printf("q (# ker FT quadr pts) = %d\n", q); + FLT f[MAX_NQUAD]; + double z[2 * MAX_NQUAD], w[2 * MAX_NQUAD]; // glr needs double + legendre_compute_glr(2 * q, z, w); // only half the nodes used, eg on (0,1) + for (int n = 0; n < q; ++n) { + z[n] *= (FLT)J2; // quadr nodes for [0,J/2] + f[n] = J2 * (FLT)w[n] * evaluate_kernel((FLT)z[n], opts); // w/ quadr weights + // printf("f[%d] = %.3g\n",n,f[n]); } #pragma omp parallel for num_threads(opts.nthreads) - for (BIGINT j=0;jfwBatch, using the same set of @@ -424,7 +439,7 @@ int spreadinterpSortedBatch(int batchSize, FINUFFT_PLAN p, CPX* cBatch) Returns 0 (no error reporting for now). Notes: 1) cBatch is already assumed to have the correct offset, ie here we - read from the start of cBatch (unlike Malleo). fwBatch also has zero offset + read from the start of cBatch (unlike Malleo). fwBatch also has zero offset 2) this routine is a batched version of spreadinterpSorted in spreadinterp.cpp Barnett 5/19/20, based on Malleo 2019. */ @@ -433,19 +448,19 @@ int spreadinterpSortedBatch(int batchSize, FINUFFT_PLAN p, CPX* cBatch) // omp_sets_nested deprecated, so don't use; assume not nested for 2 to work. 
// But when nthr_outer=1 here, omp par inside the loop sees all threads... #ifdef _OPENMP - int nthr_outer = p->opts.spread_thread==1 ? 1 : batchSize; + int nthr_outer = p->opts.spread_thread == 1 ? 1 : batchSize; #endif #pragma omp parallel for num_threads(nthr_outer) - for (int i=0; ifwBatch + i*p->nf; // start of i'th fw array in wkspace - CPX *ci = cBatch + i*p->nj; // start of i'th c array in cBatch - spreadinterpSorted(p->sortIndices, p->nf1, p->nf2, p->nf3, (FLT*)fwi, p->nj, - p->X, p->Y, p->Z, (FLT*)ci, p->spopts, p->didSort); + for (int i = 0; i < batchSize; i++) { + FFTW_CPX *fwi = p->fwBatch + i * p->nf; // start of i'th fw array in wkspace + CPX *ci = cBatch + i * p->nj; // start of i'th c array in cBatch + spreadinterpSorted(p->sortIndices, p->nf1, p->nf2, p->nf3, (FLT *)fwi, p->nj, p->X, + p->Y, p->Z, (FLT *)ci, p->spopts, p->didSort); } return 0; } -int deconvolveBatch(int batchSize, FINUFFT_PLAN p, CPX* fkBatch) +int deconvolveBatch(int batchSize, FINUFFT_PLAN p, CPX *fkBatch) /* Type 1: deconvolves (amplifies) from each interior fw array in p->fwBatch into each output array fk in fkBatch. @@ -459,29 +474,25 @@ int deconvolveBatch(int batchSize, FINUFFT_PLAN p, CPX* fkBatch) { // since deconvolveshuffle?d are single-thread, omp par seems to help here... #pragma omp parallel for num_threads(batchSize) - for (int i=0; ifwBatch + i*p->nf; // start of i'th fw array in wkspace - CPX *fki = fkBatch + i*p->N; // start of i'th fk array in fkBatch - + for (int i = 0; i < batchSize; i++) { + FFTW_CPX *fwi = p->fwBatch + i * p->nf; // start of i'th fw array in wkspace + CPX *fki = fkBatch + i * p->N; // start of i'th fk array in fkBatch + // Call routine from common.cpp for the dim; prefactors hardcoded to 1.0... if (p->dim == 1) - deconvolveshuffle1d(p->spopts.spread_direction, 1.0, p->phiHat1, - p->ms, (FLT *)fki, + deconvolveshuffle1d(p->spopts.spread_direction, 1.0, p->phiHat1, p->ms, (FLT *)fki, p->nf1, fwi, p->opts.modeord); else if (p->dim == 2) - deconvolveshuffle2d(p->spopts.spread_direction,1.0, p->phiHat1, - p->phiHat2, p->ms, p->mt, (FLT *)fki, - p->nf1, p->nf2, fwi, p->opts.modeord); + deconvolveshuffle2d(p->spopts.spread_direction, 1.0, p->phiHat1, p->phiHat2, p->ms, + p->mt, (FLT *)fki, p->nf1, p->nf2, fwi, p->opts.modeord); else - deconvolveshuffle3d(p->spopts.spread_direction, 1.0, p->phiHat1, - p->phiHat2, p->phiHat3, p->ms, p->mt, p->mu, - (FLT *)fki, p->nf1, p->nf2, p->nf3, - fwi, p->opts.modeord); + deconvolveshuffle3d(p->spopts.spread_direction, 1.0, p->phiHat1, p->phiHat2, + p->phiHat3, p->ms, p->mt, p->mu, (FLT *)fki, p->nf1, p->nf2, + p->nf3, fwi, p->opts.modeord); } return 0; } - // since this func is local only, we macro its name here... #ifdef SINGLE #define GRIDSIZE_FOR_FFTW gridsize_for_fftwf @@ -489,21 +500,20 @@ int deconvolveBatch(int batchSize, FINUFFT_PLAN p, CPX* fkBatch) #define GRIDSIZE_FOR_FFTW gridsize_for_fftw #endif -int* GRIDSIZE_FOR_FFTW(FINUFFT_PLAN p){ -// local helper func returns a new int array of length dim, extracted from -// the finufft plan, that fftw_plan_many_dft needs as its 2nd argument. - int* nf; - if(p->dim == 1){ - nf = new int[1]; +int *GRIDSIZE_FOR_FFTW(FINUFFT_PLAN p) { + // local helper func returns a new int array of length dim, extracted from + // the finufft plan, that fftw_plan_many_dft needs as its 2nd argument. 
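// For instance (illustrative, matching the branches below): a 2D fine grid of
// nf1 x nf2 points, with nf1 the fastest-varying axis of fwBatch, must be
// described to fftw_plan_many_dft as {nf2, nf1}, since FFTW takes dimensions
// in row-major order (slowest axis first); the 3D case is {nf3, nf2, nf1}.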
+ int *nf; + if (p->dim == 1) { + nf = new int[1]; nf[0] = (int)p->nf1; - } - else if (p->dim == 2){ - nf = new int[2]; + } else if (p->dim == 2) { + nf = new int[2]; nf[0] = (int)p->nf2; - nf[1] = (int)p->nf1; - } // fftw enforced row major ordering, ie dims are backwards ordered - else{ - nf = new int[3]; + nf[1] = (int)p->nf1; + } // fftw enforced row major ordering, ie dims are backwards ordered + else { + nf = new int[3]; nf[0] = (int)p->nf3; nf[1] = (int)p->nf2; nf[2] = (int)p->nf1; @@ -511,17 +521,12 @@ int* GRIDSIZE_FOR_FFTW(FINUFFT_PLAN p){ return nf; } - - } // namespace -} // namespace - - - +} // namespace common +} // namespace finufft // --------------- rest is the 5 user guru (plan) interface drivers: --------- // (not namespaced since have safe names finufft{f}_* ) -using namespace finufft::common; // accesses routines defined above - +using namespace finufft::common; // accesses routines defined above // Marco Barbone: 5.8.2024 // These are user-facing. @@ -540,26 +545,26 @@ void FINUFFT_DEFAULT_OPTS(finufft_opts *o) o->modeord = 0; o->chkbnds = 1; - o->debug = 0; + o->debug = 0; o->spread_debug = 0; - o->showwarn = 1; + o->showwarn = 1; - o->nthreads = 0; - o->fftw = FFTW_ESTIMATE; // - o->spread_sort = 2; + o->nthreads = 0; + o->fftw = FFTW_ESTIMATE; // + o->spread_sort = 2; o->spread_kerevalmeth = 1; - o->spread_kerpad = 1; - o->upsampfac = 0.0; - o->spread_thread = 0; - o->maxbatchsize = 0; + o->spread_kerpad = 1; + o->upsampfac = 0.0; + o->spread_thread = 0; + o->maxbatchsize = 0; o->spread_nthr_atomic = -1; o->spread_max_sp_size = 0; // sphinx tag (don't remove): @defopts_end } // PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP -int FINUFFT_MAKEPLAN(int type, int dim, BIGINT* n_modes, int iflag, - int ntrans, FLT tol, FINUFFT_PLAN *pp, finufft_opts* opts) +int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans, FLT tol, + FINUFFT_PLAN *pp, finufft_opts *opts) // Populates the fields of finufft_plan which is pointed to by "pp". // opts is ptr to a finufft_opts to set options, or NULL to use defaults. // For some of the fields (if "auto" selected) here choose the actual setting. 
@@ -567,460 +572,512 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT* n_modes, int iflag, // evaluates spreading kernel coefficients, and instantiates the fftw_plan { FINUFFT_PLAN p; - p = new FINUFFT_PLAN_S; // allocate fresh plan struct - *pp = p; // pass out plan as ptr to plan struct + p = new FINUFFT_PLAN_S; // allocate fresh plan struct + *pp = p; // pass out plan as ptr to plan struct - if (opts==NULL) // use default opts + if (opts == NULL) // use default opts FINUFFT_DEFAULT_OPTS(&(p->opts)); - else // or read from what's passed in - p->opts = *opts; // keep a deep copy; changing *opts now has no effect - - if (p->opts.debug) // do a hello world - printf("[%s] new plan: FINUFFT version " FINUFFT_VER " .................\n",__func__); - - if((type!=1)&&(type!=2)&&(type!=3)) { - fprintf(stderr, "[%s] Invalid type (%d), should be 1, 2 or 3.\n",__func__,type); + else // or read from what's passed in + p->opts = *opts; // keep a deep copy; changing *opts now has no effect + + if (p->opts.debug) // do a hello world + printf("[%s] new plan: FINUFFT version " FINUFFT_VER " .................\n", + __func__); + + if ((type != 1) && (type != 2) && (type != 3)) { + fprintf(stderr, "[%s] Invalid type (%d), should be 1, 2 or 3.\n", __func__, type); return FINUFFT_ERR_TYPE_NOTVALID; } - if((dim!=1)&&(dim!=2)&&(dim!=3)) { - fprintf(stderr, "[%s] Invalid dim (%d), should be 1, 2 or 3.\n",__func__,dim); + if ((dim != 1) && (dim != 2) && (dim != 3)) { + fprintf(stderr, "[%s] Invalid dim (%d), should be 1, 2 or 3.\n", __func__, dim); return FINUFFT_ERR_DIM_NOTVALID; } - if (ntrans<1) { - fprintf(stderr,"[%s] ntrans (%d) should be at least 1.\n",__func__,ntrans); + if (ntrans < 1) { + fprintf(stderr, "[%s] ntrans (%d) should be at least 1.\n", __func__, ntrans); return FINUFFT_ERR_NTRANS_NOTVALID; } - + // get stuff from args... - p->type = type; - p->dim = dim; - p->ntrans = ntrans; - p->tol = tol; - p->fftSign = (iflag>=0) ? 1 : -1; // clean up flag input + p->type = type; + p->dim = dim; + p->ntrans = ntrans; + p->tol = tol; + p->fftSign = (iflag >= 0) ? 1 : -1; // clean up flag input - // choose overall # threads... + // choose overall # threads... #ifdef _OPENMP int ompmaxnthr = MY_OMP_GET_MAX_THREADS(); - int nthr = ompmaxnthr; // default: use as many as OMP gives us + int nthr = ompmaxnthr; // default: use as many as OMP gives us // (the above could be set, or suggested set, to 1 for small enough problems...) 
- if (p->opts.nthreads>0) { - nthr = p->opts.nthreads; // user override, now without limit + if (p->opts.nthreads > 0) { + nthr = p->opts.nthreads; // user override, now without limit if (p->opts.showwarn && (nthr > ompmaxnthr)) - fprintf(stderr,"%s warning: using opts.nthreads=%d, more than the %d OpenMP claims available; note large nthreads can be slower.\n",__func__,nthr,ompmaxnthr); + fprintf(stderr, + "%s warning: using opts.nthreads=%d, more than the %d OpenMP claims " + "available; note large nthreads can be slower.\n", + __func__, nthr, ompmaxnthr); } #else - int nthr = 1; // always 1 thread (avoid segfault) - if (p->opts.nthreads>1) - fprintf(stderr,"%s warning: opts.nthreads=%d but library is single-threaded; ignoring!\n",__func__,p->opts.nthreads); + int nthr = 1; // always 1 thread (avoid segfault) + if (p->opts.nthreads > 1) + fprintf(stderr, + "%s warning: opts.nthreads=%d but library is single-threaded; ignoring!\n", + __func__, p->opts.nthreads); #endif - p->opts.nthreads = nthr; // store actual # thr planned for + p->opts.nthreads = nthr; // store actual # thr planned for // (this sets/limits all downstream spread/interp, 1dkernel, and FFT thread counts...) - + // choose batchSize for types 1,2 or 3... (uses int ceil(b/a)=1+(b-1)/a trick) - if (p->opts.maxbatchsize==0) { // logic to auto-set best batchsize - p->nbatch = 1+(ntrans-1)/nthr; // min # batches poss - p->batchSize = 1+(ntrans-1)/p->nbatch; // then cut # thr in each b - } else { // batchSize override by user - p->batchSize = min(p->opts.maxbatchsize,ntrans); - p->nbatch = 1+(ntrans-1)/p->batchSize; // resulting # batches + if (p->opts.maxbatchsize == 0) { // logic to auto-set best batchsize + p->nbatch = 1 + (ntrans - 1) / nthr; // min # batches poss + p->batchSize = 1 + (ntrans - 1) / p->nbatch; // then cut # thr in each b + } else { // batchSize override by user + p->batchSize = min(p->opts.maxbatchsize, ntrans); + p->nbatch = 1 + (ntrans - 1) / p->batchSize; // resulting # batches } - if (p->opts.spread_thread==0) - p->opts.spread_thread=2; // our auto choice - if (p->opts.spread_thread!=1 && p->opts.spread_thread!=2) { - fprintf(stderr,"[%s] illegal opts.spread_thread!\n",__func__); + if (p->opts.spread_thread == 0) p->opts.spread_thread = 2; // our auto choice + if (p->opts.spread_thread != 1 && p->opts.spread_thread != 2) { + fprintf(stderr, "[%s] illegal opts.spread_thread!\n", __func__); return FINUFFT_ERR_SPREAD_THREAD_NOTVALID; } - if (type!=3) { // read in user Fourier mode array sizes... + if (type != 3) { // read in user Fourier mode array sizes... p->ms = n_modes[0]; - p->mt = (dim>1) ? n_modes[1] : 1; // leave as 1 for unused dims - p->mu = (dim>2) ? n_modes[2] : 1; - p->N = p->ms*p->mt*p->mu; // N = total # modes + p->mt = (dim > 1) ? n_modes[1] : 1; // leave as 1 for unused dims + p->mu = (dim > 2) ? n_modes[2] : 1; + p->N = p->ms * p->mt * p->mu; // N = total # modes } - + // heuristic to choose default upsampfac... (currently two poss) - if (p->opts.upsampfac==0.0) { // indicates auto-choose - p->opts.upsampfac=2.0; // default, and need for tol small - if (tol>=(FLT)1E-9) { // the tol sigma=5/4 can reach - if (type==3) // could move to setpts, more known? 
- p->opts.upsampfac=1.25; // faster b/c smaller RAM & FFT - else if ((dim==1 && p->N>10000000) || (dim==2 && p->N>300000) || (dim==3 && p->N>3000000)) // type 1,2 heuristic cutoffs, double, typ tol, 12-core xeon - p->opts.upsampfac=1.25; + if (p->opts.upsampfac == 0.0) { // indicates auto-choose + p->opts.upsampfac = 2.0; // default, and need for tol small + if (tol >= (FLT)1E-9) { // the tol sigma=5/4 can reach + if (type == 3) // could move to setpts, more known? + p->opts.upsampfac = 1.25; // faster b/c smaller RAM & FFT + else if ((dim == 1 && p->N > 10000000) || (dim == 2 && p->N > 300000) || + (dim == 3 && p->N > 3000000)) // type 1,2 heuristic cutoffs, double, + // typ tol, 12-core xeon + p->opts.upsampfac = 1.25; } if (p->opts.debug > 1) - printf("[%s] set auto upsampfac=%.2f\n",__func__,p->opts.upsampfac); + printf("[%s] set auto upsampfac=%.2f\n", __func__, p->opts.upsampfac); } // use opts to choose and write into plan's spread options... int ier = setup_spreader_for_nufft(p->spopts, tol, p->opts, dim); - if (ier>1) // proceed if success or warning + if (ier > 1) // proceed if success or warning return ier; // set others as defaults (or unallocated for arrays)... - p->X = NULL; p->Y = NULL; p->Z = NULL; - p->phiHat1 = NULL; p->phiHat2 = NULL; p->phiHat3 = NULL; - p->nf1 = 1; p->nf2 = 1; p->nf3 = 1; // crucial to leave as 1 for unused dims - p->sortIndices = NULL; // used in all three types - + p->X = NULL; + p->Y = NULL; + p->Z = NULL; + p->phiHat1 = NULL; + p->phiHat2 = NULL; + p->phiHat3 = NULL; + p->nf1 = 1; + p->nf2 = 1; + p->nf3 = 1; // crucial to leave as 1 for unused dims + p->sortIndices = NULL; // used in all three types + // ------------------------ types 1,2: planning needed --------------------- - if (type==1 || type==2) { + if (type == 1 || type == 2) { - int nthr_fft = nthr; // give FFTW all threads (or use o.spread_thread?) - // Note: batchSize not used since might be only 1. + int nthr_fft = nthr; // give FFTW all threads (or use o.spread_thread?) + // Note: batchSize not used since might be only 1. // Now place FFTW initialization in a lock, courtesy of OMP. Makes FINUFFT // thread-safe (can be called inside OMP) { - static bool did_fftw_init = false; // the only global state of FINUFFT + static bool did_fftw_init = false; // the only global state of FINUFFT std::lock_guard lock(fftw_lock); if (!did_fftw_init) { - FFTW_INIT(); // setup FFTW global state; should only do once - did_fftw_init = true; // ensure other FINUFFT threads don't clash + FFTW_INIT(); // setup FFTW global state; should only do once + did_fftw_init = true; // ensure other FINUFFT threads don't clash } } p->spopts.spread_direction = type; - if (p->opts.showwarn) { // user warn round-off error... - if (EPSILON*p->ms>1.0) - fprintf(stderr,"%s warning: rounding err predicted eps_mach*N1 = %.3g > 1 !\n",__func__,(double)(EPSILON*p->ms)); - if (EPSILON*p->mt>1.0) - fprintf(stderr,"%s warning: rounding err predicted eps_mach*N2 = %.3g > 1 !\n",__func__,(double)(EPSILON*p->mt)); - if (EPSILON*p->mu>1.0) - fprintf(stderr,"%s warning: rounding err predicted eps_mach*N3 = %.3g > 1 !\n",__func__,(double)(EPSILON*p->mu)); + if (p->opts.showwarn) { // user warn round-off error... 
+ if (EPSILON * p->ms > 1.0) + fprintf(stderr, "%s warning: rounding err predicted eps_mach*N1 = %.3g > 1 !\n", + __func__, (double)(EPSILON * p->ms)); + if (EPSILON * p->mt > 1.0) + fprintf(stderr, "%s warning: rounding err predicted eps_mach*N2 = %.3g > 1 !\n", + __func__, (double)(EPSILON * p->mt)); + if (EPSILON * p->mu > 1.0) + fprintf(stderr, "%s warning: rounding err predicted eps_mach*N3 = %.3g > 1 !\n", + __func__, (double)(EPSILON * p->mu)); } - + // determine fine grid sizes, sanity check.. int nfier = SET_NF_TYPE12(p->ms, p->opts, p->spopts, &(p->nf1)); - if (nfier) return nfier; // nf too big; we're done - p->phiHat1 = (FLT*)malloc(sizeof(FLT)*(p->nf1/2 + 1)); + if (nfier) return nfier; // nf too big; we're done + p->phiHat1 = (FLT *)malloc(sizeof(FLT) * (p->nf1 / 2 + 1)); if (dim > 1) { nfier = SET_NF_TYPE12(p->mt, p->opts, p->spopts, &(p->nf2)); if (nfier) return nfier; - p->phiHat2 = (FLT*)malloc(sizeof(FLT)*(p->nf2/2 + 1)); + p->phiHat2 = (FLT *)malloc(sizeof(FLT) * (p->nf2 / 2 + 1)); } if (dim > 2) { - nfier = SET_NF_TYPE12(p->mu, p->opts, p->spopts, &(p->nf3)); + nfier = SET_NF_TYPE12(p->mu, p->opts, p->spopts, &(p->nf3)); if (nfier) return nfier; - p->phiHat3 = (FLT*)malloc(sizeof(FLT)*(p->nf3/2 + 1)); + p->phiHat3 = (FLT *)malloc(sizeof(FLT) * (p->nf3 / 2 + 1)); } if (p->opts.debug) { // "long long" here is to avoid warnings with printf... - printf("[%s] %dd%d: (ms,mt,mu)=(%lld,%lld,%lld) (nf1,nf2,nf3)=(%lld,%lld,%lld)\n ntrans=%d nthr=%d batchSize=%d ", __func__, - dim, type, (long long)p->ms,(long long)p->mt, - (long long) p->mu, (long long)p->nf1,(long long)p->nf2, - (long long)p->nf3, ntrans, nthr, p->batchSize); - if (p->batchSize==1) // spread_thread has no effect in this case + printf("[%s] %dd%d: (ms,mt,mu)=(%lld,%lld,%lld) " + "(nf1,nf2,nf3)=(%lld,%lld,%lld)\n ntrans=%d nthr=%d " + "batchSize=%d ", + __func__, dim, type, (long long)p->ms, (long long)p->mt, (long long)p->mu, + (long long)p->nf1, (long long)p->nf2, (long long)p->nf3, ntrans, nthr, + p->batchSize); + if (p->batchSize == 1) // spread_thread has no effect in this case printf("\n"); else printf(" spread_thread=%d\n", p->opts.spread_thread); } // STEP 0: get Fourier coeffs of spreading kernel along each fine grid dim - CNTime timer; timer.start(); + CNTime timer; + timer.start(); onedim_fseries_kernel(p->nf1, p->phiHat1, p->spopts); - if (dim>1) onedim_fseries_kernel(p->nf2, p->phiHat2, p->spopts); - if (dim>2) onedim_fseries_kernel(p->nf3, p->phiHat3, p->spopts); - if (p->opts.debug) printf("[%s] kernel fser (ns=%d):\t\t%.3g s\n",__func__,p->spopts.nspread, timer.elapsedsec()); + if (dim > 1) onedim_fseries_kernel(p->nf2, p->phiHat2, p->spopts); + if (dim > 2) onedim_fseries_kernel(p->nf3, p->phiHat3, p->spopts); + if (p->opts.debug) + printf("[%s] kernel fser (ns=%d):\t\t%.3g s\n", __func__, p->spopts.nspread, + timer.elapsedsec()); timer.restart(); - p->nf = p->nf1*p->nf2*p->nf3; // fine grid total number of points + p->nf = p->nf1 * p->nf2 * p->nf3; // fine grid total number of points if (p->nf * p->batchSize > MAX_NF) { - fprintf(stderr, "[%s] fwBatch would be bigger than MAX_NF, not attempting malloc!\n",__func__); + fprintf(stderr, + "[%s] fwBatch would be bigger than MAX_NF, not attempting malloc!\n", + __func__); return FINUFFT_ERR_MAXNALLOC; } p->fwBatch = FFTW_ALLOC_CPX(p->nf * p->batchSize); // the big workspace - if (p->opts.debug) printf("[%s] fwBatch %.2fGB alloc: \t%.3g s\n", __func__,(double)1E-09*sizeof(CPX)*p->nf*p->batchSize, timer.elapsedsec()); - if(!p->fwBatch) { // we don't 
catch all such mallocs, just this big one - fprintf(stderr, "[%s] FFTW malloc failed for fwBatch (working fine grids)!\n",__func__); - free(p->phiHat1); free(p->phiHat2); free(p->phiHat3); + if (p->opts.debug) + printf("[%s] fwBatch %.2fGB alloc: \t%.3g s\n", __func__, + (double)1E-09 * sizeof(CPX) * p->nf * p->batchSize, timer.elapsedsec()); + if (!p->fwBatch) { // we don't catch all such mallocs, just this big one + fprintf(stderr, "[%s] FFTW malloc failed for fwBatch (working fine grids)!\n", + __func__); + free(p->phiHat1); + free(p->phiHat2); + free(p->phiHat3); return FINUFFT_ERR_ALLOC; } - - timer.restart(); // plan the FFTW + + timer.restart(); // plan the FFTW int *ns = GRIDSIZE_FOR_FFTW(p); - // fftw_plan_many_dft args: rank, gridsize/dim, howmany, in, inembed, istride, idist, ot, onembed, ostride, odist, sign, flags + // fftw_plan_many_dft args: rank, gridsize/dim, howmany, in, inembed, istride, + // idist, ot, onembed, ostride, odist, sign, flags { std::lock_guard lock(fftw_lock); // FFTW_PLAN_TH sets all future fftw_plan calls to use nthr_fft threads. - // FIXME: Since this might override what the user wants for fftw, we'd like to set it - // just for our one plan and then revert to the user value. Unfortunately - // fftw_planner_nthreads wasn't introduced until fftw 3.3.9, and there isn't a convenient - // mechanism to probe the version + // FIXME: Since this might override what the user wants for fftw, we'd like to + // set it just for our one plan and then revert to the user value. + // Unfortunately fftw_planner_nthreads wasn't introduced until fftw 3.3.9, and + // there isn't a convenient mechanism to probe the version FFTW_PLAN_TH(nthr_fft); - p->fftwPlan = FFTW_PLAN_MANY_DFT(dim, ns, p->batchSize, p->fwBatch, NULL, 1, p->nf, p->fwBatch, NULL, 1, p->nf, - p->fftSign, p->opts.fftw); + p->fftwPlan = + FFTW_PLAN_MANY_DFT(dim, ns, p->batchSize, p->fwBatch, NULL, 1, p->nf, + p->fwBatch, NULL, 1, p->nf, p->fftSign, p->opts.fftw); } - if (p->opts.debug) printf("[%s] FFTW plan (mode %d, nthr=%d):\t%.3g s\n", __func__,p->opts.fftw, nthr_fft, timer.elapsedsec()); - delete []ns; - - } else { // -------------------------- type 3 (no planning) ------------ + if (p->opts.debug) + printf("[%s] FFTW plan (mode %d, nthr=%d):\t%.3g s\n", __func__, p->opts.fftw, + nthr_fft, timer.elapsedsec()); + delete[] ns; + + } else { // -------------------------- type 3 (no planning) ------------ - if (p->opts.debug) printf("[%s] %dd%d: ntrans=%d\n",__func__,dim,type,ntrans); + if (p->opts.debug) printf("[%s] %dd%d: ntrans=%d\n", __func__, dim, type, ntrans); // in case destroy occurs before setpts, need safe dummy ptrs/plans... - p->CpBatch = NULL; - p->fwBatch = NULL; - p->Sp = NULL; p->Tp = NULL; p->Up = NULL; - p->prephase = NULL; - p->deconv = NULL; + p->CpBatch = NULL; + p->fwBatch = NULL; + p->Sp = NULL; + p->Tp = NULL; + p->Up = NULL; + p->prephase = NULL; + p->deconv = NULL; p->innerT2plan = NULL; // Type 3 will call finufft_makeplan for type 2; no need to init FFTW // Note we don't even know nj or nk yet, so can't do anything else! 
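// Illustrative sketch of the in-place batched FFTW plan created in the
// type-1/2 branch above, spelled out with plain double-precision FFTW names
// (the FFTW_* macros expand to these, or to their fftwf_ counterparts in the
// single-precision build). Transform i of the batch acts on fw + i*nf because
// the stride is 1 and the batch-to-batch distance is nf; parameter names here
// are placeholders, not the plan fields themselves.
#include <fftw3.h>
fftw_plan plan_batched_fw(int dim, const int *ns, int batchSize, fftw_complex *fw,
                          long long nf, int fftSign) {
  return fftw_plan_many_dft(dim, ns, batchSize,
                            fw, NULL, 1, (int)nf, // input: contiguous, batch distance nf
                            fw, NULL, 1, (int)nf, // output: same array (in place)
                            fftSign, FFTW_ESTIMATE);
}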
} - return ier; // report setup_spreader status (could be warning) + return ier; // report setup_spreader status (could be warning) } - // SSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS -int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT* xj, FLT* yj, FLT* zj, - BIGINT nk, FLT* s, FLT* t, FLT* u) +int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, BIGINT nk, + FLT *s, FLT *t, FLT *u) /* For type 1,2: just checks and (possibly) sorts the NU xyz points, in prep for spreading. (The last 4 arguments are ignored.) For type 3: allocates internal working arrays, scales/centers the NU points and NU target freqs (stu), evaluates spreading kernel FT at all target freqs. */ { - int d = p->dim; // abbrev for spatial dim - CNTime timer; timer.start(); - p->nj = nj; // the user only now chooses how many NU (x,y,z) pts - if (nj<0) { - fprintf(stderr,"[%s] nj (%lld) cannot be negative!\n",__func__,(long long)nj); + int d = p->dim; // abbrev for spatial dim + CNTime timer; + timer.start(); + p->nj = nj; // the user only now chooses how many NU (x,y,z) pts + if (nj < 0) { + fprintf(stderr, "[%s] nj (%lld) cannot be negative!\n", __func__, (long long)nj); return FINUFFT_ERR_NUM_NU_PTS_INVALID; - } else if (nj>MAX_NU_PTS) { - fprintf(stderr,"[%s] nj (%lld) exceeds MAX_NU_PTS\n",__func__,(long long)nj); + } else if (nj > MAX_NU_PTS) { + fprintf(stderr, "[%s] nj (%lld) exceeds MAX_NU_PTS\n", __func__, (long long)nj); return FINUFFT_ERR_NUM_NU_PTS_INVALID; } - - if (p->type!=3) { // ------------------ TYPE 1,2 SETPTS ------------------- - // (all we can do is check and maybe bin-sort the NU pts) - p->X = xj; // plan must keep pointers to user's fixed NU pts - p->Y = yj; - p->Z = zj; + + if (p->type != 3) { // ------------------ TYPE 1,2 SETPTS ------------------- + // (all we can do is check and maybe bin-sort the NU pts) + p->X = xj; // plan must keep pointers to user's fixed NU pts + p->Y = yj; + p->Z = zj; int ier = spreadcheck(p->nf1, p->nf2, p->nf3, p->nj, xj, yj, zj, p->spopts); - if (p->opts.debug>1) printf("[%s] spreadcheck (%d):\t%.3g s\n", __func__, p->spopts.chkbnds, timer.elapsedsec()); - if (ier) // no warnings allowed here - return ier; + if (p->opts.debug > 1) + printf("[%s] spreadcheck (%d):\t%.3g s\n", __func__, p->spopts.chkbnds, + timer.elapsedsec()); + if (ier) // no warnings allowed here + return ier; timer.restart(); - // Free sortIndices if it has been allocated before in case of repeated setpts calls causing memory leak. - // We don't know it is the same size as before, so we have to malloc each time. + // Free sortIndices if it has been allocated before in case of repeated setpts + // calls causing memory leak. We don't know it is the same size as before, so we + // have to malloc each time. 
if (p->sortIndices) free(p->sortIndices); - p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT)*p->nj); + p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT) * p->nj); if (!p->sortIndices) { - fprintf(stderr,"[%s] failed to allocate sortIndices!\n",__func__); + fprintf(stderr, "[%s] failed to allocate sortIndices!\n", __func__); return FINUFFT_ERR_SPREAD_ALLOC; } - p->didSort = indexSort(p->sortIndices, p->nf1, p->nf2, p->nf3, p->nj, xj, yj, zj, p->spopts); - if (p->opts.debug) printf("[%s] sort (didSort=%d):\t\t%.3g s\n", __func__,p->didSort, timer.elapsedsec()); + p->didSort = + indexSort(p->sortIndices, p->nf1, p->nf2, p->nf3, p->nj, xj, yj, zj, p->spopts); + if (p->opts.debug) + printf("[%s] sort (didSort=%d):\t\t%.3g s\n", __func__, p->didSort, + timer.elapsedsec()); - - } else { // ------------------------- TYPE 3 SETPTS ----------------------- - // (here we can precompute pre/post-phase factors and plan the t2) + } else { // ------------------------- TYPE 3 SETPTS ----------------------- + // (here we can precompute pre/post-phase factors and plan the t2) - if (nk<0) { - fprintf(stderr,"[%s] nk (%lld) cannot be negative!\n",__func__,(long long)nk); + if (nk < 0) { + fprintf(stderr, "[%s] nk (%lld) cannot be negative!\n", __func__, (long long)nk); return FINUFFT_ERR_NUM_NU_PTS_INVALID; - } else if (nk>MAX_NU_PTS) { - fprintf(stderr,"[%s] nk (%lld) exceeds MAX_NU_PTS\n",__func__,(long long)nk); + } else if (nk > MAX_NU_PTS) { + fprintf(stderr, "[%s] nk (%lld) exceeds MAX_NU_PTS\n", __func__, (long long)nk); return FINUFFT_ERR_NUM_NU_PTS_INVALID; } - p->nk = nk; // user set # targ freq pts - p->S = s; // keep pointers to user's input target pts - p->T = t; - p->U = u; + p->nk = nk; // user set # targ freq pts + p->S = s; // keep pointers to user's input target pts + p->T = t; + p->U = u; // pick x, s intervals & shifts & # fine grid pts (nf) in each dim... - FLT S1,S2,S3; // get half-width X, center C, which contains {x_j}... - arraywidcen(nj,xj,&(p->t3P.X1),&(p->t3P.C1)); - arraywidcen(nk,s,&S1,&(p->t3P.D1)); // same D, S, but for {s_k} - set_nhg_type3(S1,p->t3P.X1,p->opts,p->spopts, - &(p->nf1),&(p->t3P.h1),&(p->t3P.gam1)); // applies twist i) - p->t3P.C2 = 0.0; // their defaults if dim 2 unused, etc + FLT S1, S2, S3; // get half-width X, center C, which contains {x_j}... 
+ arraywidcen(nj, xj, &(p->t3P.X1), &(p->t3P.C1)); + arraywidcen(nk, s, &S1, &(p->t3P.D1)); // same D, S, but for {s_k} + set_nhg_type3(S1, p->t3P.X1, p->opts, p->spopts, &(p->nf1), &(p->t3P.h1), + &(p->t3P.gam1)); // applies twist i) + p->t3P.C2 = 0.0; // their defaults if dim 2 unused, etc p->t3P.D2 = 0.0; - if (d>1) { - arraywidcen(nj,yj,&(p->t3P.X2),&(p->t3P.C2)); // {y_j} - arraywidcen(nk,t,&S2,&(p->t3P.D2)); // {t_k} - set_nhg_type3(S2,p->t3P.X2,p->opts,p->spopts,&(p->nf2), - &(p->t3P.h2),&(p->t3P.gam2)); - } + if (d > 1) { + arraywidcen(nj, yj, &(p->t3P.X2), &(p->t3P.C2)); // {y_j} + arraywidcen(nk, t, &S2, &(p->t3P.D2)); // {t_k} + set_nhg_type3(S2, p->t3P.X2, p->opts, p->spopts, &(p->nf2), &(p->t3P.h2), + &(p->t3P.gam2)); + } p->t3P.C3 = 0.0; p->t3P.D3 = 0.0; - if (d>2) { - arraywidcen(nj,zj,&(p->t3P.X3),&(p->t3P.C3)); // {z_j} - arraywidcen(nk,u,&S3,&(p->t3P.D3)); // {u_k} - set_nhg_type3(S3,p->t3P.X3,p->opts,p->spopts, - &(p->nf3),&(p->t3P.h3),&(p->t3P.gam3)); + if (d > 2) { + arraywidcen(nj, zj, &(p->t3P.X3), &(p->t3P.C3)); // {z_j} + arraywidcen(nk, u, &S3, &(p->t3P.D3)); // {u_k} + set_nhg_type3(S3, p->t3P.X3, p->opts, p->spopts, &(p->nf3), &(p->t3P.h3), + &(p->t3P.gam3)); } - if (p->opts.debug) { // report on choices of shifts, centers, etc... - printf("\tM=%lld N=%lld\n",(long long)nj,(long long)nk); - printf("\tX1=%.3g C1=%.3g S1=%.3g D1=%.3g gam1=%g nf1=%lld\t\n", p->t3P.X1, p->t3P.C1,S1, p->t3P.D1, p->t3P.gam1,(long long) p->nf1); - if (d>1) - printf("\tX2=%.3g C2=%.3g S2=%.3g D2=%.3g gam2=%g nf2=%lld\n",p->t3P.X2, p->t3P.C2,S2, p->t3P.D2, p->t3P.gam2,(long long) p->nf2); - if (d>2) - printf("\tX3=%.3g C3=%.3g S3=%.3g D3=%.3g gam3=%g nf3=%lld\n", p->t3P.X3, p->t3P.C3,S3, p->t3P.D3, p->t3P.gam3,(long long) p->nf3); + if (p->opts.debug) { // report on choices of shifts, centers, etc... 
+ printf("\tM=%lld N=%lld\n", (long long)nj, (long long)nk); + printf("\tX1=%.3g C1=%.3g S1=%.3g D1=%.3g gam1=%g nf1=%lld\t\n", p->t3P.X1, + p->t3P.C1, S1, p->t3P.D1, p->t3P.gam1, (long long)p->nf1); + if (d > 1) + printf("\tX2=%.3g C2=%.3g S2=%.3g D2=%.3g gam2=%g nf2=%lld\n", p->t3P.X2, + p->t3P.C2, S2, p->t3P.D2, p->t3P.gam2, (long long)p->nf2); + if (d > 2) + printf("\tX3=%.3g C3=%.3g S3=%.3g D3=%.3g gam3=%g nf3=%lld\n", p->t3P.X3, + p->t3P.C3, S3, p->t3P.D3, p->t3P.gam3, (long long)p->nf3); } - p->nf = p->nf1*p->nf2*p->nf3; // fine grid total number of points + p->nf = p->nf1 * p->nf2 * p->nf3; // fine grid total number of points if (p->nf * p->batchSize > MAX_NF) { - fprintf(stderr, "[%s t3] fwBatch would be bigger than MAX_NF, not attempting malloc!\n",__func__); + fprintf(stderr, + "[%s t3] fwBatch would be bigger than MAX_NF, not attempting malloc!\n", + __func__); return FINUFFT_ERR_MAXNALLOC; } - if (p->fwBatch) - FFTW_FR(p->fwBatch); + if (p->fwBatch) FFTW_FR(p->fwBatch); p->fwBatch = FFTW_ALLOC_CPX(p->nf * p->batchSize); // maybe big workspace // (note FFTW_ALLOC is not needed over malloc, but matches its type) - if(p->CpBatch) free(p->CpBatch); - p->CpBatch = (CPX*)malloc(sizeof(CPX) * nj*p->batchSize); // batch c' work - if (p->opts.debug) printf("[%s t3] widcen, batch %.2fGB alloc:\t%.3g s\n", __func__, (double)1E-09*sizeof(CPX)*(p->nf+nj)*p->batchSize, timer.elapsedsec()); - if(!p->fwBatch || !p->CpBatch) { - fprintf(stderr, "[%s t3] malloc fail for fwBatch or CpBatch!\n",__func__); - return FINUFFT_ERR_ALLOC; + if (p->CpBatch) free(p->CpBatch); + p->CpBatch = (CPX *)malloc(sizeof(CPX) * nj * p->batchSize); // batch c' work + if (p->opts.debug) + printf("[%s t3] widcen, batch %.2fGB alloc:\t%.3g s\n", __func__, + (double)1E-09 * sizeof(CPX) * (p->nf + nj) * p->batchSize, + timer.elapsedsec()); + if (!p->fwBatch || !p->CpBatch) { + fprintf(stderr, "[%s t3] malloc fail for fwBatch or CpBatch!\n", __func__); + return FINUFFT_ERR_ALLOC; } - //printf("fwbatch, cpbatch ptrs: %llx %llx\n",p->fwBatch,p->CpBatch); + // printf("fwbatch, cpbatch ptrs: %llx %llx\n",p->fwBatch,p->CpBatch); // alloc rescaled NU src pts x'_j (in X etc), rescaled NU targ pts s'_k ... - if(p->X) free(p->X); - if(p->Sp) free(p->Sp); - p->X = (FLT*)malloc(sizeof(FLT)*nj); - p->Sp = (FLT*)malloc(sizeof(FLT)*nk); - if (d>1) { - if(p->Y) free(p->Y); - if(p->Tp) free(p->Tp); - p->Y = (FLT*)malloc(sizeof(FLT)*nj); - p->Tp = (FLT*)malloc(sizeof(FLT)*nk); + if (p->X) free(p->X); + if (p->Sp) free(p->Sp); + p->X = (FLT *)malloc(sizeof(FLT) * nj); + p->Sp = (FLT *)malloc(sizeof(FLT) * nk); + if (d > 1) { + if (p->Y) free(p->Y); + if (p->Tp) free(p->Tp); + p->Y = (FLT *)malloc(sizeof(FLT) * nj); + p->Tp = (FLT *)malloc(sizeof(FLT) * nk); } - if (d>2) { - if(p->Z) free(p->Z); - if(p->Up) free(p->Up); - p->Z = (FLT*)malloc(sizeof(FLT)*nj); - p->Up = (FLT*)malloc(sizeof(FLT)*nk); + if (d > 2) { + if (p->Z) free(p->Z); + if (p->Up) free(p->Up); + p->Z = (FLT *)malloc(sizeof(FLT) * nj); + p->Up = (FLT *)malloc(sizeof(FLT) * nk); } // always shift as use gam to rescale x_j to x'_j, etc (twist iii)... 
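// In formulas (dimension 1 shown; this restates what the loops below compute,
// using the same plan-field names): each source point and target frequency is
// recentered and rescaled as
//   x'_j = (x_j - C1) / gam1,        s'_k = h1 * gam1 * (s_k - D1),
// so that |s'_k| < pi/R as noted in the comments below; the strengths pick up
// the prephase e^{+-i D1 x_j}, and each output value is divided by
// phihat(s'_k) and multiplied by the postphase e^{+-i (s_k - D1) C1}.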
- FLT ig1 = 1.0/p->t3P.gam1, ig2=0.0, ig3=0.0; // "reciprocal-math" optim - if (d>1) - ig2 = 1.0/p->t3P.gam2; - if (d>2) - ig3 = 1.0/p->t3P.gam3; + FLT ig1 = 1.0 / p->t3P.gam1, ig2 = 0.0, ig3 = 0.0; // "reciprocal-math" optim + if (d > 1) ig2 = 1.0 / p->t3P.gam2; + if (d > 2) ig3 = 1.0 / p->t3P.gam3; #pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) - for (BIGINT j=0;jX[j] = (xj[j] - p->t3P.C1) * ig1; // rescale x_j - if (d>1) // (ok to do inside loop because of branch predict) - p->Y[j] = (yj[j]- p->t3P.C2) * ig2; // rescale y_j - if (d>2) - p->Z[j] = (zj[j] - p->t3P.C3) * ig3; // rescale z_j + for (BIGINT j = 0; j < nj; ++j) { + p->X[j] = (xj[j] - p->t3P.C1) * ig1; // rescale x_j + if (d > 1) // (ok to do inside loop because of branch predict) + p->Y[j] = (yj[j] - p->t3P.C2) * ig2; // rescale y_j + if (d > 2) p->Z[j] = (zj[j] - p->t3P.C3) * ig3; // rescale z_j } // set up prephase array... - CPX imasign = (p->fftSign>=0) ? IMA : -IMA; // +-i - if(p->prephase) free(p->prephase); - p->prephase = (CPX*)malloc(sizeof(CPX)*nj); - if (p->t3P.D1!=0.0 || p->t3P.D2!=0.0 || p->t3P.D3!=0.0) { + CPX imasign = (p->fftSign >= 0) ? IMA : -IMA; // +-i + if (p->prephase) free(p->prephase); + p->prephase = (CPX *)malloc(sizeof(CPX) * nj); + if (p->t3P.D1 != 0.0 || p->t3P.D2 != 0.0 || p->t3P.D3 != 0.0) { #pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) - for (BIGINT j=0;jt3P.D1*xj[j]; - if (d>1) - phase += p->t3P.D2*yj[j]; - if (d>2) - phase += p->t3P.D3*zj[j]; - p->prephase[j] = cos(phase)+imasign*sin(phase); // Euler e^{+-i.phase} + for (BIGINT j = 0; j < nj; ++j) { // ... loop over src NU locs + FLT phase = p->t3P.D1 * xj[j]; + if (d > 1) phase += p->t3P.D2 * yj[j]; + if (d > 2) phase += p->t3P.D3 * zj[j]; + p->prephase[j] = cos(phase) + imasign * sin(phase); // Euler + // e^{+-i.phase} } } else - for (BIGINT j=0;jprephase[j] = (CPX)1.0; // *** or keep flag so no mult in exec?? - - // rescale the target s_k etc to s'_k etc... + for (BIGINT j = 0; j < nj; ++j) + p->prephase[j] = (CPX)1.0; // *** or keep flag so no mult in exec?? + + // rescale the target s_k etc to s'_k etc... #pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) - for (BIGINT k=0;kSp[k] = p->t3P.h1*p->t3P.gam1*(s[k]- p->t3P.D1); // so |s'_k| < pi/R - if (d>1) - p->Tp[k] = p->t3P.h2*p->t3P.gam2*(t[k]- p->t3P.D2); // so |t'_k| < pi/R - if (d>2) - p->Up[k] = p->t3P.h3*p->t3P.gam3*(u[k]- p->t3P.D3); // so |u'_k| < pi/R + for (BIGINT k = 0; k < nk; ++k) { + p->Sp[k] = p->t3P.h1 * p->t3P.gam1 * (s[k] - p->t3P.D1); // so |s'_k| < pi/R + if (d > 1) + p->Tp[k] = p->t3P.h2 * p->t3P.gam2 * (t[k] - p->t3P.D2); // so |t'_k| < + // pi/R + if (d > 2) + p->Up[k] = p->t3P.h3 * p->t3P.gam3 * (u[k] - p->t3P.D3); // so |u'_k| < + // pi/R } - + // (old STEP 3a) Compute deconvolution post-factors array (per targ pt)... 
// (exploits that FT separates because kernel is prod of 1D funcs) - if(p->deconv) free(p->deconv); - p->deconv = (CPX*)malloc(sizeof(CPX)*nk); - FLT *phiHatk1 = (FLT*)malloc(sizeof(FLT)*nk); // don't confuse w/ p->phiHat - onedim_nuft_kernel(nk, p->Sp, phiHatk1, p->spopts); // fill phiHat1 + if (p->deconv) free(p->deconv); + p->deconv = (CPX *)malloc(sizeof(CPX) * nk); + FLT *phiHatk1 = (FLT *)malloc(sizeof(FLT) * nk); // don't confuse w/ p->phiHat + onedim_nuft_kernel(nk, p->Sp, phiHatk1, p->spopts); // fill phiHat1 FLT *phiHatk2 = NULL, *phiHatk3 = NULL; - if (d>1) { - phiHatk2 = (FLT*)malloc(sizeof(FLT)*nk); - onedim_nuft_kernel(nk, p->Tp, phiHatk2, p->spopts); // fill phiHat2 + if (d > 1) { + phiHatk2 = (FLT *)malloc(sizeof(FLT) * nk); + onedim_nuft_kernel(nk, p->Tp, phiHatk2, p->spopts); // fill phiHat2 } - if (d>2) { - phiHatk3 = (FLT*)malloc(sizeof(FLT)*nk); - onedim_nuft_kernel(nk, p->Up, phiHatk3, p->spopts); // fill phiHat3 + if (d > 2) { + phiHatk3 = (FLT *)malloc(sizeof(FLT) * nk); + onedim_nuft_kernel(nk, p->Up, phiHatk3, p->spopts); // fill phiHat3 } - int Cfinite = isfinite(p->t3P.C1) && isfinite(p->t3P.C2) && isfinite(p->t3P.C3); // C can be nan or inf if M=0, no input NU pts - int Cnonzero = p->t3P.C1!=0.0 || p->t3P.C2!=0.0 || p->t3P.C3!=0.0; // cen + int Cfinite = + isfinite(p->t3P.C1) && isfinite(p->t3P.C2) && isfinite(p->t3P.C3); // C can be nan + // or inf if + // M=0, no + // input NU pts + int Cnonzero = p->t3P.C1 != 0.0 || p->t3P.C2 != 0.0 || p->t3P.C3 != 0.0; // cen #pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) - for (BIGINT k=0;k1) - phiHat *= phiHatk2[k]; - if (d>2) - phiHat *= phiHatk3[k]; + if (d > 1) phiHat *= phiHatk2[k]; + if (d > 2) phiHat *= phiHatk3[k]; p->deconv[k] = (CPX)(1.0 / phiHat); if (Cfinite && Cnonzero) { FLT phase = (s[k] - p->t3P.D1) * p->t3P.C1; - if (d>1) - phase += (t[k] - p->t3P.D2) * p->t3P.C2; - if (d>2) - phase += (u[k] - p->t3P.D3) * p->t3P.C3; - p->deconv[k] *= cos(phase)+imasign*sin(phase); // Euler e^{+-i.phase} + if (d > 1) phase += (t[k] - p->t3P.D2) * p->t3P.C2; + if (d > 2) phase += (u[k] - p->t3P.D3) * p->t3P.C3; + p->deconv[k] *= cos(phase) + imasign * sin(phase); // Euler e^{+-i.phase} } } - free(phiHatk1); free(phiHatk2); free(phiHatk3); // done w/ deconv fill - if (p->opts.debug) printf("[%s t3] phase & deconv factors:\t%.3g s\n",__func__,timer.elapsedsec()); + free(phiHatk1); + free(phiHatk2); + free(phiHatk3); // done w/ deconv fill + if (p->opts.debug) + printf("[%s t3] phase & deconv factors:\t%.3g s\n", __func__, timer.elapsedsec()); // Set up sort for spreading Cp (from primed NU src pts X, Y, Z) to fw... timer.restart(); - // Free sortIndices if it has been allocated before in case of repeated setpts calls causing memory leak. - // We don't know it is the same size as before, so we have to malloc each time. + // Free sortIndices if it has been allocated before in case of repeated setpts + // calls causing memory leak. We don't know it is the same size as before, so we + // have to malloc each time. 
if (p->sortIndices) free(p->sortIndices); - p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT)*p->nj); + p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT) * p->nj); if (!p->sortIndices) { - fprintf(stderr,"[%s t3] failed to allocate sortIndices!\n",__func__); + fprintf(stderr, "[%s t3] failed to allocate sortIndices!\n", __func__); return FINUFFT_ERR_SPREAD_ALLOC; } - p->didSort = indexSort(p->sortIndices, p->nf1, p->nf2, p->nf3, p->nj, p->X, p->Y, p->Z, p->spopts); - if (p->opts.debug) printf("[%s t3] sort (didSort=%d):\t\t%.3g s\n",__func__, p->didSort, timer.elapsedsec()); - + p->didSort = indexSort(p->sortIndices, p->nf1, p->nf2, p->nf3, p->nj, p->X, p->Y, + p->Z, p->spopts); + if (p->opts.debug) + printf("[%s t3] sort (didSort=%d):\t\t%.3g s\n", __func__, p->didSort, + timer.elapsedsec()); + // Plan and setpts once, for the (repeated) inner type 2 finufft call... timer.restart(); - BIGINT t2nmodes[] = {p->nf1,p->nf2,p->nf3}; // t2 input is actually fw - finufft_opts t2opts = p->opts; // deep copy, since not ptrs - t2opts.modeord = 0; // needed for correct t3! - t2opts.debug = max(0,p->opts.debug-1); // don't print as much detail - t2opts.spread_debug = max(0,p->opts.spread_debug-1); - t2opts.showwarn = 0; // so don't see warnings 2x + BIGINT t2nmodes[] = {p->nf1, p->nf2, p->nf3}; // t2 input is actually fw + finufft_opts t2opts = p->opts; // deep copy, since not ptrs + t2opts.modeord = 0; // needed for correct t3! + t2opts.debug = max(0, p->opts.debug - 1); // don't print as much detail + t2opts.spread_debug = max(0, p->opts.spread_debug - 1); + t2opts.showwarn = 0; // so don't see warnings 2x // (...could vary other t2opts here?) - if(p->innerT2plan) FINUFFT_DESTROY(p->innerT2plan); + if (p->innerT2plan) FINUFFT_DESTROY(p->innerT2plan); int ier = FINUFFT_MAKEPLAN(2, d, t2nmodes, p->fftSign, p->batchSize, p->tol, &p->innerT2plan, &t2opts); - if (ier>1) { // if merely warning, still proceed - fprintf(stderr,"[%s t3]: inner type 2 plan creation failed with ier=%d!\n",__func__,ier); + if (ier > 1) { // if merely warning, still proceed + fprintf(stderr, "[%s t3]: inner type 2 plan creation failed with ier=%d!\n", + __func__, ier); return ier; } - ier = FINUFFT_SETPTS(p->innerT2plan, nk, p->Sp, p->Tp, p->Up, 0, NULL, NULL, NULL); // note nk = # output points (not nj) - if (ier>1) { - fprintf(stderr,"[%s t3]: inner type 2 setpts failed, ier=%d!\n",__func__,ier); + ier = FINUFFT_SETPTS(p->innerT2plan, nk, p->Sp, p->Tp, p->Up, 0, NULL, NULL, + NULL); // note nk = # output points (not nj) + if (ier > 1) { + fprintf(stderr, "[%s t3]: inner type 2 setpts failed, ier=%d!\n", __func__, ier); return ier; } - if (p->opts.debug) printf("[%s t3] inner t2 plan & setpts: \t%.3g s\n", __func__,timer.elapsedsec()); - + if (p->opts.debug) + printf("[%s t3] inner t2 plan & setpts: \t%.3g s\n", __func__, timer.elapsedsec()); } return 0; } // ............ end setpts .................................................. - // EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE -int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX* cj, CPX* fk){ -/* See ../docs/cguru.doc for current documentation. +int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX *cj, CPX *fk) { + /* See ../docs/cguru.doc for current documentation. For given (stack of) weights cj or coefficients fk, performs NUFFTs with existing (sorted) NU pts and existing plan. @@ -1032,131 +1089,138 @@ int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX* cj, CPX* fk){ Return value 0 (no error diagnosis yet). Barnett 5/20/20, based on Malleo 2019. 
*/ - CNTime timer; timer.start(); - - if (p->type!=3){ // --------------------- TYPE 1,2 EXEC ------------------ - - double t_sprint = 0.0, t_fft = 0.0, t_deconv = 0.0; // accumulated timing + CNTime timer; + timer.start(); + + if (p->type != 3) { // --------------------- TYPE 1,2 EXEC ------------------ + + double t_sprint = 0.0, t_fft = 0.0, t_deconv = 0.0; // accumulated timing if (p->opts.debug) - printf("[%s] start ntrans=%d (%d batches, bsize=%d)...\n", __func__, p->ntrans, p->nbatch, p->batchSize); - - for (int b=0; b*p->batchSize < p->ntrans; b++) { // .....loop b over batches + printf("[%s] start ntrans=%d (%d batches, bsize=%d)...\n", __func__, p->ntrans, + p->nbatch, p->batchSize); + + for (int b = 0; b * p->batchSize < p->ntrans; b++) { // .....loop b over batches // current batch is either batchSize, or possibly truncated if last one - int thisBatchSize = min(p->ntrans - b*p->batchSize, p->batchSize); - int bB = b*p->batchSize; // index of vector, since batchsizes same - CPX* cjb = cj + bB*p->nj; // point to batch of weights - CPX* fkb = fk + bB*p->N; // point to batch of mode coeffs - if (p->opts.debug>1) printf("[%s] start batch %d (size %d):\n",__func__, b,thisBatchSize); - + int thisBatchSize = min(p->ntrans - b * p->batchSize, p->batchSize); + int bB = b * p->batchSize; // index of vector, since batchsizes same + CPX *cjb = cj + bB * p->nj; // point to batch of weights + CPX *fkb = fk + bB * p->N; // point to batch of mode coeffs + if (p->opts.debug > 1) + printf("[%s] start batch %d (size %d):\n", __func__, b, thisBatchSize); + // STEP 1: (varies by type) timer.restart(); - if (p->type == 1) { // type 1: spread NU pts p->X, weights cj, to fw grid + if (p->type == 1) { // type 1: spread NU pts p->X, weights cj, to fw grid spreadinterpSortedBatch(thisBatchSize, p, cjb); t_sprint += timer.elapsedsec(); - } else { // type 2: amplify Fourier coeffs fk into 0-padded fw + } else { // type 2: amplify Fourier coeffs fk into 0-padded fw deconvolveBatch(thisBatchSize, p, fkb); t_deconv += timer.elapsedsec(); } - + // STEP 2: call the pre-planned FFT on this batch timer.restart(); - FFTW_EX(p->fftwPlan); // if thisBatchSizefftwPlan); // if thisBatchSizeopts.debug>1) - printf("\tFFTW exec:\t\t%.3g s\n", timer.elapsedsec()); - + if (p->opts.debug > 1) printf("\tFFTW exec:\t\t%.3g s\n", timer.elapsedsec()); + // STEP 3: (varies by type) - timer.restart(); - if (p->type == 1) { // type 1: deconvolve (amplify) fw and shuffle to fk + timer.restart(); + if (p->type == 1) { // type 1: deconvolve (amplify) fw and shuffle to fk deconvolveBatch(thisBatchSize, p, fkb); t_deconv += timer.elapsedsec(); - } else { // type 2: interpolate unif fw grid to NU target pts + } else { // type 2: interpolate unif fw grid to NU target pts spreadinterpSortedBatch(thisBatchSize, p, cjb); - t_sprint += timer.elapsedsec(); + t_sprint += timer.elapsedsec(); } - } // ........end b loop - - if (p->opts.debug) { // report total times in their natural order... - if(p->type == 1) { - printf("[%s] done. tot spread:\t\t%.3g s\n",__func__,t_sprint); + } // ........end b loop + + if (p->opts.debug) { // report total times in their natural order... + if (p->type == 1) { + printf("[%s] done. tot spread:\t\t%.3g s\n", __func__, t_sprint); printf(" tot FFT:\t\t\t\t%.3g s\n", t_fft); printf(" tot deconvolve:\t\t\t%.3g s\n", t_deconv); } else { - printf("[%s] done. tot deconvolve:\t\t%.3g s\n",__func__,t_deconv); + printf("[%s] done. 
tot deconvolve:\t\t%.3g s\n", __func__, t_deconv); printf(" tot FFT:\t\t\t\t%.3g s\n", t_fft); - printf(" tot interp:\t\t\t%.3g s\n",t_sprint); + printf(" tot interp:\t\t\t%.3g s\n", t_sprint); } } } - else { // ----------------------------- TYPE 3 EXEC --------------------- + else { // ----------------------------- TYPE 3 EXEC --------------------- - //for (BIGINT j=0;j<10;++j) printf("\tcj[%ld]=%.15g+%.15gi\n",(long int)j,(double)real(cj[j]),(double)imag(cj[j])); // debug - - double t_pre=0.0, t_spr=0.0, t_t2=0.0, t_deconv=0.0; // accumulated timings + // for (BIGINT j=0;j<10;++j) printf("\tcj[%ld]=%.15g+%.15gi\n",(long + // int)j,(double)real(cj[j]),(double)imag(cj[j])); // debug + + double t_pre = 0.0, t_spr = 0.0, t_t2 = 0.0, + t_deconv = 0.0; // accumulated timings if (p->opts.debug) - printf("[%s t3] start ntrans=%d (%d batches, bsize=%d)...\n",__func__,p->ntrans, p->nbatch, p->batchSize); + printf("[%s t3] start ntrans=%d (%d batches, bsize=%d)...\n", __func__, p->ntrans, + p->nbatch, p->batchSize); - for (int b=0; b*p->batchSize < p->ntrans; b++) { // .....loop b over batches + for (int b = 0; b * p->batchSize < p->ntrans; b++) { // .....loop b over batches // batching and pointers to this batch, identical to t1,2 above... - int thisBatchSize = min(p->ntrans - b*p->batchSize, p->batchSize); - int bB = b*p->batchSize; - CPX* cjb = cj + bB*p->nj; // batch of input strengths - CPX* fkb = fk + bB*p->nk; // batch of output strengths - if (p->opts.debug>1) printf("[%s t3] start batch %d (size %d):\n",__func__,b,thisBatchSize); - + int thisBatchSize = min(p->ntrans - b * p->batchSize, p->batchSize); + int bB = b * p->batchSize; + CPX *cjb = cj + bB * p->nj; // batch of input strengths + CPX *fkb = fk + bB * p->nk; // batch of output strengths + if (p->opts.debug > 1) + printf("[%s t3] start batch %d (size %d):\n", __func__, b, thisBatchSize); + // STEP 0: pre-phase (possibly) the c_j input strengths into c'_j batch... timer.restart(); -#pragma omp parallel for num_threads(p->opts.nthreads) // or p->batchSize? - for (int i=0; inj; - for (BIGINT j=0;jnj;++j) - p->CpBatch[ioff+j] = p->prephase[j] * cjb[ioff+j]; +#pragma omp parallel for num_threads(p->opts.nthreads) // or p->batchSize? + for (int i = 0; i < thisBatchSize; i++) { + BIGINT ioff = i * p->nj; + for (BIGINT j = 0; j < p->nj; ++j) + p->CpBatch[ioff + j] = p->prephase[j] * cjb[ioff + j]; } - t_pre += timer.elapsedsec(); - + t_pre += timer.elapsedsec(); + // STEP 1: spread c'_j batch (x'_j NU pts) into fw batch grid... timer.restart(); - p->spopts.spread_direction = 1; // spread - spreadinterpSortedBatch(thisBatchSize, p, p->CpBatch); // p->X are primed + p->spopts.spread_direction = 1; // spread + spreadinterpSortedBatch(thisBatchSize, p, p->CpBatch); // p->X are primed t_spr += timer.elapsedsec(); - //for (int j=0;jnf1;++j) printf("fw[%d]=%.3g+%.3gi\n",j,p->fwBatch[j][0],p->fwBatch[j][1]); // debug - + // for (int j=0;jnf1;++j) + // printf("fw[%d]=%.3g+%.3gi\n",j,p->fwBatch[j][0],p->fwBatch[j][1]); // + // debug + // STEP 2: type 2 NUFFT from fw batch to user output fk array batch... timer.restart(); // illegal possible shrink of ntrans *after* plan for smaller last batch: - p->innerT2plan->ntrans = thisBatchSize; // do not try this at home! + p->innerT2plan->ntrans = thisBatchSize; // do not try this at home! 
/* (alarming that FFTW not shrunk, but safe, because t2's fwBatch array - still the same size, as Andrea explained; just wastes a few flops) */ - FINUFFT_EXECUTE(p->innerT2plan, fkb, (CPX*)(p->fwBatch)); + still the same size, as Andrea explained; just wastes a few flops) */ + FINUFFT_EXECUTE(p->innerT2plan, fkb, (CPX *)(p->fwBatch)); t_t2 += timer.elapsedsec(); // STEP 3: apply deconvolve (precomputed 1/phiHat(targ_k), phasing too)... timer.restart(); #pragma omp parallel for num_threads(p->opts.nthreads) - for (int i=0; ink; - for (BIGINT k=0;knk;++k) - fkb[ioff+k] *= p->deconv[k]; + for (int i = 0; i < thisBatchSize; i++) { + BIGINT ioff = i * p->nk; + for (BIGINT k = 0; k < p->nk; ++k) fkb[ioff + k] *= p->deconv[k]; } t_deconv += timer.elapsedsec(); - } // ........end b loop + } // ........end b loop - if (p->opts.debug) { // report total times in their natural order... - printf("[%s t3] done. tot prephase:\t\t%.3g s\n",__func__,t_pre); - printf(" tot spread:\t\t\t%.3g s\n",t_spr); + if (p->opts.debug) { // report total times in their natural order... + printf("[%s t3] done. tot prephase:\t\t%.3g s\n", __func__, t_pre); + printf(" tot spread:\t\t\t%.3g s\n", t_spr); printf(" tot type 2:\t\t\t%.3g s\n", t_t2); printf(" tot deconvolve:\t\t%.3g s\n", t_deconv); - } + } } - //for (BIGINT k=0;k<10;++k) printf("\tfk[%ld]=%.15g+%.15gi\n",(long int)k,(double)real(fk[k]),(double)imag(fk[k])); // debug - - return 0; -} + // for (BIGINT k=0;k<10;++k) printf("\tfk[%ld]=%.15g+%.15gi\n",(long + // int)k,(double)real(fk[k]),(double)imag(fk[k])); // debug + return 0; +} // DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD int FINUFFT_DESTROY(FINUFFT_PLAN p) @@ -1165,12 +1229,12 @@ int FINUFFT_DESTROY(FINUFFT_PLAN p) // Thus either each thing free'd here is guaranteed to be NULL or correctly // allocated. { - if (!p) // NULL ptr, so not a ptr to a plan, report error + if (!p) // NULL ptr, so not a ptr to a plan, report error return 1; FFTW_FR(p->fwBatch); // free the big FFTW (or t3 spread) working array free(p->sortIndices); - if (p->type==1 || p->type==2) { + if (p->type == 1 || p->type == 2) { { std::lock_guard lock(fftw_lock); FFTW_DE(p->fftwPlan); @@ -1178,14 +1242,18 @@ int FINUFFT_DESTROY(FINUFFT_PLAN p) free(p->phiHat1); free(p->phiHat2); free(p->phiHat3); - } else { // free the stuff alloc for type 3 only - FINUFFT_DESTROY(p->innerT2plan); // if NULL, ignore its error code + } else { // free the stuff alloc for type 3 only + FINUFFT_DESTROY(p->innerT2plan); // if NULL, ignore its error code free(p->CpBatch); - free(p->Sp); free(p->Tp); free(p->Up); - free(p->X); free(p->Y); free(p->Z); + free(p->Sp); + free(p->Tp); + free(p->Up); + free(p->X); + free(p->Y); + free(p->Z); free(p->prephase); free(p->deconv); } delete p; - return 0; // success + return 0; // success } diff --git a/src/simpleinterfaces.cpp b/src/simpleinterfaces.cpp index e07e76c02..edd25adfb 100644 --- a/src/simpleinterfaces.cpp +++ b/src/simpleinterfaces.cpp @@ -1,8 +1,8 @@ // public header #include // private headers -#include #include +#include using namespace std; /* --------------------------------------------------------------------------- @@ -18,281 +18,274 @@ using namespace std; --------------------------------------------------------------------------- */ - // Helper layer ........................................................... 
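// Usage sketch of the guru sequence that the helper layer below wraps
// (double-precision names, to which the FINUFFT_* macros expand; the sizes and
// data are placeholders, and error handling is abbreviated):
#include <complex>
#include <cstdint>
#include <vector>
#include <finufft.h>
int guru_demo() {
  int64_t M = 1000, N = 256;                     // #nonuniform pts, #modes
  std::vector<double> x(M, 0.5);                 // NU points in [-pi,pi)
  std::vector<std::complex<double>> c(M, {1.0, 0.0}), F(N);
  finufft_opts opts;
  finufft_default_opts(&opts);
  int64_t nmodes[3] = {N, 1, 1};
  finufft_plan plan;
  int ier = finufft_makeplan(1, 1, nmodes, +1, 1, 1e-9, &plan, &opts);
  if (ier > 1) return ier;                       // >1 is an error; 1 is only a warning
  finufft_setpts(plan, M, x.data(), nullptr, nullptr, 0, nullptr, nullptr, nullptr);
  finufft_execute(plan, c.data(), F.data());     // type-1: strengths c -> modes F
  finufft_destroy(plan);
  return 0;
}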
namespace finufft { - namespace common { +namespace common { -int invokeGuruInterface(int n_dims, int type, int n_transf, BIGINT nj, FLT* xj, - FLT *yj, FLT *zj, CPX* cj,int iflag, FLT eps, - BIGINT *n_modes, BIGINT nk, FLT *s, FLT *t, FLT *u, - CPX* fk, finufft_opts *popts) +int invokeGuruInterface(int n_dims, int type, int n_transf, BIGINT nj, FLT *xj, FLT *yj, + FLT *zj, CPX *cj, int iflag, FLT eps, BIGINT *n_modes, BIGINT nk, + FLT *s, FLT *t, FLT *u, CPX *fk, finufft_opts *popts) // Helper layer between simple interfaces (with opts) and the guru functions. // Author: Andrea Malleo, 2019. { FINUFFT_PLAN plan; - int ier = FINUFFT_MAKEPLAN(type, n_dims, n_modes, iflag, n_transf, eps, - &plan, popts); // popts (ptr to opts) can be NULL - if (ier>1) { // since 1 (a warning) still allows proceeding... + int ier = FINUFFT_MAKEPLAN(type, n_dims, n_modes, iflag, n_transf, eps, &plan, + popts); // popts (ptr to opts) can be NULL + if (ier > 1) { // since 1 (a warning) still allows proceeding... fprintf(stderr, "FINUFFT invokeGuru: plan error (ier=%d)!\n", ier); delete plan; return ier; } int ier2 = FINUFFT_SETPTS(plan, nj, xj, yj, zj, nk, s, t, u); - if (ier2>1) { - fprintf(stderr,"FINUFFT invokeGuru: setpts error (ier=%d)!\n", ier2); + if (ier2 > 1) { + fprintf(stderr, "FINUFFT invokeGuru: setpts error (ier=%d)!\n", ier2); FINUFFT_DESTROY(plan); return ier2; } int ier3 = FINUFFT_EXECUTE(plan, cj, fk); - if (ier3>1) { - fprintf(stderr,"FINUFFT invokeGuru: execute error (ier=%d)!\n", ier3); + if (ier3 > 1) { + fprintf(stderr, "FINUFFT invokeGuru: execute error (ier=%d)!\n", ier3); FINUFFT_DESTROY(plan); return ier3; } FINUFFT_DESTROY(plan); - return max(max(ier,ier2),ier3); // in case any one gave a (positive!) warning + return max(max(ier, ier2), ier3); // in case any one gave a (positive!) warning } - } // namespace -} // namespace +} // namespace common +} // namespace finufft using namespace finufft::common; - // Dimension 1111111111111111111111111111111111111111111111111111111111111111 -int FINUFFT1D1(BIGINT nj,FLT* xj,CPX* cj,int iflag,FLT eps,BIGINT ms, - CPX* fk, finufft_opts *opts) +int FINUFFT1D1(BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, BIGINT ms, CPX *fk, + finufft_opts *opts) // Type-1 1D complex nonuniform FFT. See ../docs/usage.rst { - BIGINT n_modes[]={ms,1,1}; - int n_dims = 1; - int n_transf = 1; - int type = 1; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, - iflag, eps, n_modes, 0, NULL, NULL, NULL, fk, opts); + BIGINT n_modes[] = {ms, 1, 1}; + int n_dims = 1; + int n_transf = 1; + int type = 1; + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag, + eps, n_modes, 0, NULL, NULL, NULL, fk, opts); return ier; } -int FINUFFT1D1MANY(int n_transf, BIGINT nj,FLT* xj,CPX* cj,int iflag,FLT eps, - BIGINT ms, CPX* fk, finufft_opts *opts) +int FINUFFT1D1MANY(int n_transf, BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, + BIGINT ms, CPX *fk, finufft_opts *opts) // Type-1 1D complex nonuniform FFT for many vectors. 
See ../docs/usage.rst { - BIGINT n_modes[]={ms,1,1}; - int n_dims = 1; - int type = 1; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, - iflag, eps, n_modes, 0, NULL, NULL, NULL, fk, opts); + BIGINT n_modes[] = {ms, 1, 1}; + int n_dims = 1; + int type = 1; + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag, + eps, n_modes, 0, NULL, NULL, NULL, fk, opts); return ier; } -int FINUFFT1D2(BIGINT nj,FLT* xj,CPX* cj,int iflag,FLT eps,BIGINT ms, - CPX* fk, finufft_opts *opts) +int FINUFFT1D2(BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, BIGINT ms, CPX *fk, + finufft_opts *opts) // Type-2 1D complex nonuniform FFT. See ../docs/usage.rst { - BIGINT n_modes[]={ms,1,1}; - int n_dims = 1; - int n_transf = 1; - int type = 2; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, - iflag, eps, n_modes, 0, NULL, NULL, NULL, fk, opts); + BIGINT n_modes[] = {ms, 1, 1}; + int n_dims = 1; + int n_transf = 1; + int type = 2; + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag, + eps, n_modes, 0, NULL, NULL, NULL, fk, opts); return ier; } -int FINUFFT1D2MANY(int n_transf, BIGINT nj,FLT* xj,CPX* cj,int iflag,FLT eps,BIGINT ms, - CPX* fk, finufft_opts *opts) +int FINUFFT1D2MANY(int n_transf, BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, + BIGINT ms, CPX *fk, finufft_opts *opts) // Type-2 1D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - BIGINT n_modes[]={ms,1,1}; - int n_dims = 1; - int type = 2; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, - iflag, eps, n_modes, 0, NULL, NULL, NULL, fk, opts); + BIGINT n_modes[] = {ms, 1, 1}; + int n_dims = 1; + int type = 2; + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag, + eps, n_modes, 0, NULL, NULL, NULL, fk, opts); return ier; } -int FINUFFT1D3(BIGINT nj,FLT* xj,CPX* cj,int iflag, FLT eps, BIGINT nk, FLT* s, CPX* fk, finufft_opts *opts) +int FINUFFT1D3(BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, BIGINT nk, FLT *s, + CPX *fk, finufft_opts *opts) // Type-3 1D complex nonuniform FFT. See ../docs/usage.rst { - int n_dims = 1; + int n_dims = 1; int n_transf = 1; - int type = 3; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, - iflag, eps, NULL, nk, s, NULL, NULL, fk, opts); + int type = 3; + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag, + eps, NULL, nk, s, NULL, NULL, fk, opts); return ier; } -int FINUFFT1D3MANY(int n_transf, BIGINT nj,FLT* xj,CPX* cj,int iflag, FLT eps, BIGINT nk, FLT* s, CPX* fk, finufft_opts *opts) - // Type-3 1D complex nonuniform FFT, many vectors. See ../docs/usage.rst +int FINUFFT1D3MANY(int n_transf, BIGINT nj, FLT *xj, CPX *cj, int iflag, FLT eps, + BIGINT nk, FLT *s, CPX *fk, finufft_opts *opts) +// Type-3 1D complex nonuniform FFT, many vectors. 
See ../docs/usage.rst { int n_dims = 1; - int type = 3; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, - iflag, eps, NULL, nk, s, NULL, NULL, fk, opts); + int type = 3; + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, NULL, NULL, cj, iflag, + eps, NULL, nk, s, NULL, NULL, fk, opts); return ier; } - // Dimension 22222222222222222222222222222222222222222222222222222222222222222 -int FINUFFT2D1(BIGINT nj,FLT* xj,FLT *yj,CPX* cj,int iflag, - FLT eps, BIGINT ms, BIGINT mt, CPX* fk, finufft_opts* opts) +int FINUFFT2D1(BIGINT nj, FLT *xj, FLT *yj, CPX *cj, int iflag, FLT eps, BIGINT ms, + BIGINT mt, CPX *fk, finufft_opts *opts) // Type-1 2D complex nonuniform FFT. See ../docs/usage.rst { - BIGINT n_modes[]={ms,mt,1}; - int n_dims = 2; - int n_transf = 1; - int type = 1; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj, - iflag, eps, n_modes, 0, NULL, NULL, NULL,fk, opts); + BIGINT n_modes[] = {ms, mt, 1}; + int n_dims = 2; + int n_transf = 1; + int type = 1; + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj, iflag, eps, + n_modes, 0, NULL, NULL, NULL, fk, opts); return ier; } -int FINUFFT2D1MANY(int n_transf, BIGINT nj, FLT* xj, FLT *yj, CPX* c, - int iflag, FLT eps, BIGINT ms, BIGINT mt, CPX* fk, - finufft_opts *opts) +int FINUFFT2D1MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, CPX *c, int iflag, FLT eps, + BIGINT ms, BIGINT mt, CPX *fk, finufft_opts *opts) // Type-1 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - BIGINT n_modes[]={ms,mt,1}; - int n_dims = 2; - int type = 1; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj,NULL, c, - iflag, eps, n_modes, 0, NULL, NULL, NULL, fk, opts); + BIGINT n_modes[] = {ms, mt, 1}; + int n_dims = 2; + int type = 1; + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, c, iflag, eps, + n_modes, 0, NULL, NULL, NULL, fk, opts); return ier; } -int FINUFFT2D2(BIGINT nj,FLT* xj,FLT *yj,CPX* cj,int iflag,FLT eps, - BIGINT ms, BIGINT mt, CPX* fk, finufft_opts *opts) +int FINUFFT2D2(BIGINT nj, FLT *xj, FLT *yj, CPX *cj, int iflag, FLT eps, BIGINT ms, + BIGINT mt, CPX *fk, finufft_opts *opts) // Type-2 2D complex nonuniform FFT. See ../docs/usage.rst { - BIGINT n_modes[]={ms,mt,1}; - int n_dims = 2; - int n_transf = 1; - int type = 2; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj, iflag, - eps, n_modes, 0, NULL, NULL, NULL, fk, opts); + BIGINT n_modes[] = {ms, mt, 1}; + int n_dims = 2; + int n_transf = 1; + int type = 2; + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj, iflag, eps, + n_modes, 0, NULL, NULL, NULL, fk, opts); return ier; } -int FINUFFT2D2MANY(int n_transf, BIGINT nj, FLT* xj, FLT *yj, CPX* c, int iflag, - FLT eps, BIGINT ms, BIGINT mt, CPX* fk, finufft_opts *opts) +int FINUFFT2D2MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, CPX *c, int iflag, FLT eps, + BIGINT ms, BIGINT mt, CPX *fk, finufft_opts *opts) // Type-2 2D complex nonuniform FFT, many vectors. 
See ../docs/usage.rst { - BIGINT n_modes[]={ms,mt,1}; - int n_dims = 2; - int type = 2; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, c, iflag, - eps, n_modes, 0, NULL, NULL, NULL, fk, opts); + BIGINT n_modes[] = {ms, mt, 1}; + int n_dims = 2; + int type = 2; + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, c, iflag, eps, + n_modes, 0, NULL, NULL, NULL, fk, opts); return ier; } -int FINUFFT2D3(BIGINT nj,FLT* xj,FLT* yj,CPX* cj,int iflag, FLT eps, BIGINT nk, FLT* s, FLT *t, CPX* fk, finufft_opts *opts) +int FINUFFT2D3(BIGINT nj, FLT *xj, FLT *yj, CPX *cj, int iflag, FLT eps, BIGINT nk, + FLT *s, FLT *t, CPX *fk, finufft_opts *opts) // Type-3 2D complex nonuniform FFT. See ../docs/usage.rst { - int n_dims = 2; - int type = 3; + int n_dims = 2; + int type = 3; int n_transf = 1; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj,iflag, eps, NULL, nk, s,t,NULL, fk, opts); + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj, iflag, eps, + NULL, nk, s, t, NULL, fk, opts); return ier; } -int FINUFFT2D3MANY(int n_transf, BIGINT nj,FLT* xj,FLT* yj,CPX* cj,int iflag, FLT eps, BIGINT nk, FLT* s, FLT *t, CPX* fk, finufft_opts *opts) +int FINUFFT2D3MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, CPX *cj, int iflag, FLT eps, + BIGINT nk, FLT *s, FLT *t, CPX *fk, finufft_opts *opts) // Type-3 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst { int n_dims = 2; - int type = 3; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj,iflag, eps, NULL, nk, s,t,NULL, fk, opts); + int type = 3; + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, NULL, cj, iflag, eps, + NULL, nk, s, t, NULL, fk, opts); return ier; } - - // Dimension 3333333333333333333333333333333333333333333333333333333333333333 -int FINUFFT3D1(BIGINT nj,FLT* xj,FLT *yj,FLT *zj,CPX* cj,int iflag, - FLT eps, BIGINT ms, BIGINT mt, BIGINT mu, CPX* fk, - finufft_opts *opts) +int FINUFFT3D1(BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, FLT eps, + BIGINT ms, BIGINT mt, BIGINT mu, CPX *fk, finufft_opts *opts) // Type-1 3D complex nonuniform FFT. See ../docs/usage.rst { - BIGINT n_modes[]={ms,mt,mu}; - int n_dims = 3; - int n_transf = 1; - int type = 1; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, - eps, n_modes, 0, NULL, NULL, NULL, fk, opts); + BIGINT n_modes[] = {ms, mt, mu}; + int n_dims = 3; + int n_transf = 1; + int type = 1; + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps, + n_modes, 0, NULL, NULL, NULL, fk, opts); return ier; } - -int FINUFFT3D1MANY(int n_transf, BIGINT nj,FLT* xj,FLT *yj,FLT *zj,CPX* cj, - int iflag, FLT eps, BIGINT ms, BIGINT mt, BIGINT mu, CPX* fk, - finufft_opts *opts) +int FINUFFT3D1MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, + FLT eps, BIGINT ms, BIGINT mt, BIGINT mu, CPX *fk, finufft_opts *opts) // Type-1 3D complex nonuniform FFT, many vectors. 
See ../docs/usage.rst { - BIGINT n_modes[]={ms,mt,mu}; - int n_dims = 3; - int type = 1; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, - eps, n_modes, 0, NULL, NULL, NULL, fk, opts); + BIGINT n_modes[] = {ms, mt, mu}; + int n_dims = 3; + int type = 1; + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps, + n_modes, 0, NULL, NULL, NULL, fk, opts); return ier; } -int FINUFFT3D2(BIGINT nj,FLT* xj,FLT *yj,FLT *zj,CPX* cj, - int iflag,FLT eps, BIGINT ms, BIGINT mt, BIGINT mu, - CPX* fk, finufft_opts *opts) +int FINUFFT3D2(BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, FLT eps, + BIGINT ms, BIGINT mt, BIGINT mu, CPX *fk, finufft_opts *opts) // Type-2 3D complex nonuniform FFT. See ../docs/usage.rst { - BIGINT n_modes[]={ms,mt,mu}; - int n_dims = 3; - int n_transf = 1; - int type = 2; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, - eps, n_modes, 0, NULL, NULL, NULL, fk, opts); + BIGINT n_modes[] = {ms, mt, mu}; + int n_dims = 3; + int n_transf = 1; + int type = 2; + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps, + n_modes, 0, NULL, NULL, NULL, fk, opts); return ier; } -int FINUFFT3D2MANY(int n_transf, BIGINT nj,FLT* xj,FLT *yj,FLT *zj,CPX* cj, - int iflag,FLT eps, BIGINT ms, BIGINT mt, BIGINT mu, - CPX* fk, finufft_opts *opts) +int FINUFFT3D2MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, + FLT eps, BIGINT ms, BIGINT mt, BIGINT mu, CPX *fk, finufft_opts *opts) // Type-2 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst { - BIGINT n_modes[]={ms,mt,mu}; - n_modes[0] = ms; - n_modes[1] = mt; - n_modes[2] = mu; - int n_dims = 3; - int type = 2; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, - eps, n_modes, 0, NULL, NULL, NULL, fk, opts); + BIGINT n_modes[] = {ms, mt, mu}; + n_modes[0] = ms; + n_modes[1] = mt; + n_modes[2] = mu; + int n_dims = 3; + int type = 2; + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps, + n_modes, 0, NULL, NULL, NULL, fk, opts); return ier; } -int FINUFFT3D3(BIGINT nj,FLT* xj,FLT* yj,FLT *zj, CPX* cj, - int iflag, FLT eps, BIGINT nk, FLT* s, FLT *t, - FLT *u, CPX* fk, finufft_opts *opts) +int FINUFFT3D3(BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, FLT eps, + BIGINT nk, FLT *s, FLT *t, FLT *u, CPX *fk, finufft_opts *opts) // Type-3 3D complex nonuniform FFT. See ../docs/usage.rst { - int n_dims = 3; + int n_dims = 3; int n_transf = 1; - int type = 3; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, - eps, NULL, nk, s ,t ,u, fk, opts); + int type = 3; + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps, + NULL, nk, s, t, u, fk, opts); return ier; } -int FINUFFT3D3MANY(int n_transf, BIGINT nj,FLT* xj,FLT* yj,FLT *zj, CPX* cj, - int iflag, FLT eps, BIGINT nk, FLT* s, FLT *t, - FLT *u, CPX* fk, finufft_opts *opts) +int FINUFFT3D3MANY(int n_transf, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, CPX *cj, int iflag, + FLT eps, BIGINT nk, FLT *s, FLT *t, FLT *u, CPX *fk, + finufft_opts *opts) // Type-3 3D complex nonuniform FFT, many vectors. 
See ../docs/usage.rst { int n_dims = 3; - int type = 3; - int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, - eps, NULL, nk, s ,t ,u, fk, opts); + int type = 3; + int ier = invokeGuruInterface(n_dims, type, n_transf, nj, xj, yj, zj, cj, iflag, eps, + NULL, nk, s, t, u, fk, opts); return ier; } diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 5529108e7..e6f8eaba9 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -1,65 +1,62 @@ // Spreading/interpolating module within FINUFFT. Uses precision-switching // macros for FLT, CPX, etc. -#include #include +#include #include #include -#include -#include #include #include - +#include +#include using namespace std; -using namespace finufft::utils; // access to timer +using namespace finufft::utils; // access to timer namespace finufft { - namespace spreadinterp { +namespace spreadinterp { // declarations of purely internal functions... (thus need not be in .h) static FINUFFT_ALWAYS_INLINE FLT fold_rescale(FLT x, BIGINT N) noexcept; -static FINUFFT_ALWAYS_INLINE void set_kernel_args(FLT *args, FLT x, const finufft_spread_opts& opts); -static FINUFFT_ALWAYS_INLINE void evaluate_kernel_vector(FLT *ker, FLT *args, const finufft_spread_opts& opts, const int N); -static FINUFFT_ALWAYS_INLINE void eval_kernel_vec_Horner(FLT *ker, FLT z, const int w, const finufft_spread_opts &opts); -void interp_line(FLT *out,FLT *du, FLT *ker,BIGINT i1,BIGINT N1,int ns); -void interp_square(FLT *out,FLT *du, FLT *ker1, FLT *ker2, BIGINT i1,BIGINT i2,BIGINT N1,BIGINT N2,int ns); -void interp_cube(FLT *out,FLT *du, FLT *ker1, FLT *ker2, FLT *ker3, - BIGINT i1,BIGINT i2,BIGINT i3,BIGINT N1,BIGINT N2,BIGINT N3,int ns); -void spread_subproblem_1d(BIGINT off1, BIGINT size1,FLT *du0,BIGINT M0,FLT *kx0, - FLT *dd0,const finufft_spread_opts& opts); -void spread_subproblem_2d(BIGINT off1, BIGINT off2, BIGINT size1,BIGINT size2, - FLT *du0,BIGINT M0, - FLT *kx0,FLT *ky0,FLT *dd0,const finufft_spread_opts& opts); -void spread_subproblem_3d(BIGINT off1,BIGINT off2, BIGINT off3, BIGINT size1, - BIGINT size2,BIGINT size3,FLT *du0,BIGINT M0, - FLT *kx0,FLT *ky0,FLT *kz0,FLT *dd0, - const finufft_spread_opts& opts); -void add_wrapped_subgrid(BIGINT offset1,BIGINT offset2,BIGINT offset3, - BIGINT size1,BIGINT size2,BIGINT size3,BIGINT N1, - BIGINT N2,BIGINT N3,FLT *data_uniform, FLT *du0); -void add_wrapped_subgrid_thread_safe(BIGINT offset1,BIGINT offset2,BIGINT offset3, - BIGINT size1,BIGINT size2,BIGINT size3,BIGINT N1, - BIGINT N2,BIGINT N3,FLT *data_uniform, FLT *du0); -void bin_sort_singlethread(BIGINT *ret, BIGINT M, FLT *kx, FLT *ky, FLT *kz, - BIGINT N1,BIGINT N2,BIGINT N3, - double bin_size_x,double bin_size_y,double bin_size_z, int debug); -void bin_sort_multithread(BIGINT *ret, BIGINT M, FLT *kx, FLT *ky, FLT *kz, - BIGINT N1,BIGINT N2,BIGINT N3, - double bin_size_x,double bin_size_y,double bin_size_z, int debug, - int nthr); -void get_subgrid(BIGINT &offset1,BIGINT &offset2,BIGINT &offset3,BIGINT &size1, - BIGINT &size2,BIGINT &size3,BIGINT M0,FLT* kx0,FLT* ky0, - FLT* kz0,int ns, int ndims); - - +static FINUFFT_ALWAYS_INLINE void set_kernel_args(FLT *args, FLT x, + const finufft_spread_opts &opts); +static FINUFFT_ALWAYS_INLINE void evaluate_kernel_vector( + FLT *ker, FLT *args, const finufft_spread_opts &opts, const int N); +static FINUFFT_ALWAYS_INLINE void eval_kernel_vec_Horner(FLT *ker, FLT z, const int w, + const finufft_spread_opts &opts); +void interp_line(FLT *out, FLT *du, FLT *ker, BIGINT i1, BIGINT N1, int 
ns); +void interp_square(FLT *out, FLT *du, FLT *ker1, FLT *ker2, BIGINT i1, BIGINT i2, + BIGINT N1, BIGINT N2, int ns); +void interp_cube(FLT *out, FLT *du, FLT *ker1, FLT *ker2, FLT *ker3, BIGINT i1, BIGINT i2, + BIGINT i3, BIGINT N1, BIGINT N2, BIGINT N3, int ns); +void spread_subproblem_1d(BIGINT off1, BIGINT size1, FLT *du0, BIGINT M0, FLT *kx0, + FLT *dd0, const finufft_spread_opts &opts); +void spread_subproblem_2d(BIGINT off1, BIGINT off2, BIGINT size1, BIGINT size2, FLT *du0, + BIGINT M0, FLT *kx0, FLT *ky0, FLT *dd0, + const finufft_spread_opts &opts); +void spread_subproblem_3d(BIGINT off1, BIGINT off2, BIGINT off3, BIGINT size1, + BIGINT size2, BIGINT size3, FLT *du0, BIGINT M0, FLT *kx0, + FLT *ky0, FLT *kz0, FLT *dd0, const finufft_spread_opts &opts); +void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3, BIGINT size1, + BIGINT size2, BIGINT size3, BIGINT N1, BIGINT N2, BIGINT N3, + FLT *data_uniform, FLT *du0); +void add_wrapped_subgrid_thread_safe(BIGINT offset1, BIGINT offset2, BIGINT offset3, + BIGINT size1, BIGINT size2, BIGINT size3, BIGINT N1, + BIGINT N2, BIGINT N3, FLT *data_uniform, FLT *du0); +void bin_sort_singlethread(BIGINT *ret, BIGINT M, FLT *kx, FLT *ky, FLT *kz, BIGINT N1, + BIGINT N2, BIGINT N3, double bin_size_x, double bin_size_y, + double bin_size_z, int debug); +void bin_sort_multithread(BIGINT *ret, BIGINT M, FLT *kx, FLT *ky, FLT *kz, BIGINT N1, + BIGINT N2, BIGINT N3, double bin_size_x, double bin_size_y, + double bin_size_z, int debug, int nthr); +void get_subgrid(BIGINT &offset1, BIGINT &offset2, BIGINT &offset3, BIGINT &size1, + BIGINT &size2, BIGINT &size3, BIGINT M0, FLT *kx0, FLT *ky0, FLT *kz0, + int ns, int ndims); // ========================================================================== -int spreadinterp( - BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, - BIGINT M, FLT *kx, FLT *ky, FLT *kz, FLT *data_nonuniform, - finufft_spread_opts opts) +int spreadinterp(BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, BIGINT M, FLT *kx, + FLT *ky, FLT *kz, FLT *data_nonuniform, finufft_spread_opts opts) /* ------------Spreader/interpolator for 1, 2, or 3 dimensions -------------- If opts.spread_direction=1, evaluate, in the 1D case, @@ -96,13 +93,13 @@ int spreadinterp( Inputs: N1,N2,N3 - grid sizes in x (fastest), y (medium), z (slowest) respectively. If N2==1, 1D spreading is done. If N3==1, 2D spreading. - Otherwise, 3D. + Otherwise, 3D. M - number of NU pts. kx, ky, kz - length-M real arrays of NU point coordinates (only kx read in 1D, only kx and ky read in 2D). - These should lie in the box -pi<=kx<=pi. Points outside this domain are also - correctly folded back into this domain. + These should lie in the box -pi<=kx<=pi. Points outside this domain are also + correctly folded back into this domain. 
opts - spread/interp options struct, documented in ../include/finufft_spread_opts.h Inputs/Outputs: @@ -126,16 +123,15 @@ int spreadinterp( */ { int ier = spreadcheck(N1, N2, N3, M, kx, ky, kz, opts); - if (ier) - return ier; - BIGINT* sort_indices = (BIGINT*)malloc(sizeof(BIGINT)*M); + if (ier) return ier; + BIGINT *sort_indices = (BIGINT *)malloc(sizeof(BIGINT) * M); if (!sort_indices) { - fprintf(stderr,"%s failed to allocate sort_indices!\n",__func__); + fprintf(stderr, "%s failed to allocate sort_indices!\n", __func__); return FINUFFT_ERR_SPREAD_ALLOC; } int did_sort = indexSort(sort_indices, N1, N2, N3, M, kx, ky, kz, opts); - spreadinterpSorted(sort_indices, N1, N2, N3, data_uniform, - M, kx, ky, kz, data_nonuniform, opts, did_sort); + spreadinterpSorted(sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, + data_nonuniform, opts, did_sort); free(sort_indices); return 0; } @@ -145,36 +141,38 @@ static int ndims_from_Ns(BIGINT N1, BIGINT N2, BIGINT N3) Split out, Barnett 7/26/18 */ { - int ndims = 1; // decide ndims: 1,2 or 3 - if (N2>1) ++ndims; - if (N3>1) ++ndims; + int ndims = 1; // decide ndims: 1,2 or 3 + if (N2 > 1) ++ndims; + if (N3 > 1) ++ndims; return ndims; } -int spreadcheck(BIGINT N1, BIGINT N2, BIGINT N3, BIGINT M, FLT *kx, FLT *ky, - FLT *kz, finufft_spread_opts opts) +int spreadcheck(BIGINT N1, BIGINT N2, BIGINT N3, BIGINT M, FLT *kx, FLT *ky, FLT *kz, + finufft_spread_opts opts) /* This does just the input checking and reporting for the spreader. See spreadinterp() for input arguments and meaning of returned value. Split out by Melody Shih, Jun 2018. Finiteness chk Barnett 7/30/18. - Marco Barbone 5.8.24 removed bounds check as new foldrescale is not limited to [-3pi,3pi) + Marco Barbone 5.8.24 removed bounds check as new foldrescale is not limited to + [-3pi,3pi) */ { // INPUT CHECKING & REPORTING .... cuboid not too small for spreading? - int minN = 2*opts.nspread; - if (N11 && N21 && N3 1 && N2 < minN) || (N3 > 1 && N3 < minN)) { + fprintf(stderr, + "%s error: one or more non-trivial box dims is less than 2.nspread!\n", + __func__); return FINUFFT_ERR_SPREAD_BOX_SMALL; } - if (opts.spread_direction!=1 && opts.spread_direction!=2) { - fprintf(stderr,"%s error: opts.spread_direction must be 1 or 2!\n",__func__); + if (opts.spread_direction != 1 && opts.spread_direction != 2) { + fprintf(stderr, "%s error: opts.spread_direction must be 1 or 2!\n", __func__); return FINUFFT_ERR_SPREAD_DIR; } return 0; } - -int indexSort(BIGINT* sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, BIGINT M, - FLT *kx, FLT *ky, FLT *kz, finufft_spread_opts opts) +int indexSort(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, BIGINT M, FLT *kx, + FLT *ky, FLT *kz, finufft_spread_opts opts) /* This makes a decision whether or not to sort the NU pts (influenced by opts.sort), and if yes, calls either single- or multi-threaded bin sort, writing reordered index list to sort_indices. If decided not to sort, the @@ -200,398 +198,423 @@ int indexSort(BIGINT* sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, BIGINT M, */ { CNTime timer; - int ndims = ndims_from_Ns(N1,N2,N3); - BIGINT N=N1*N2*N3; // U grid (periodic box) sizes - + int ndims = ndims_from_Ns(N1, N2, N3); + BIGINT N = N1 * N2 * N3; // U grid (periodic box) sizes + // heuristic binning box size for U grid... affects performance: double bin_size_x = 16, bin_size_y = 4, bin_size_z = 4; // put in heuristics based on cache sizes (only useful for single-thread) ? 
- int better_to_sort = !(ndims==1 && (opts.spread_direction==2 || (M > 1000*N1))); // 1D small-N or dir=2 case: don't sort - - timer.start(); // if needed, sort all the NU pts... - int did_sort=0; - int maxnthr = MY_OMP_GET_MAX_THREADS(); // used if both below opts default - if (opts.nthreads>0) - maxnthr = opts.nthreads; // user nthreads overrides, without limit - if (opts.sort_threads>0) - maxnthr = opts.sort_threads; // high-priority override, also no limit + int better_to_sort = + !(ndims == 1 && (opts.spread_direction == 2 || (M > 1000 * N1))); // 1D small-N or + // dir=2 case: + // don't sort + + timer.start(); // if needed, sort all the NU pts... + int did_sort = 0; + int maxnthr = MY_OMP_GET_MAX_THREADS(); // used if both below opts default + if (opts.nthreads > 0) + maxnthr = opts.nthreads; // user nthreads overrides, without limit + if (opts.sort_threads > 0) + maxnthr = opts.sort_threads; // high-priority override, also no limit // At this point: maxnthr = the max threads sorting could use // (we don't print warning here, since: no showwarn in spread_opts, and finufft // already warned about it. spreadinterp-only advanced users will miss a warning) - if (opts.sort==1 || (opts.sort==2 && better_to_sort)) { + if (opts.sort == 1 || (opts.sort == 2 && better_to_sort)) { // store a good permutation ordering of all NU pts (dim=1,2 or 3) - int sort_debug = (opts.debug>=2); // show timing output? - int sort_nthr = opts.sort_threads; // 0, or user max # threads for sort + int sort_debug = (opts.debug >= 2); // show timing output? + int sort_nthr = opts.sort_threads; // 0, or user max # threads for sort #ifndef _OPENMP - sort_nthr = 1; // if single-threaded lib, override user + sort_nthr = 1; // if single-threaded lib, override user #endif - if (sort_nthr==0) // multithreaded auto choice: when N>>M, one thread is better! - sort_nthr = (10*M>N) ? maxnthr : 1; // heuristic - if (sort_nthr==1) - bin_sort_singlethread(sort_indices,M,kx,ky,kz,N1,N2,N3,bin_size_x,bin_size_y,bin_size_z,sort_debug); - else // sort_nthr>1, user fixes # threads (>=2) - bin_sort_multithread(sort_indices,M,kx,ky,kz,N1,N2,N3,bin_size_x,bin_size_y,bin_size_z,sort_debug,sort_nthr); - if (opts.debug) - printf("\tsorted (%d threads):\t%.3g s\n",sort_nthr,timer.elapsedsec()); - did_sort=1; + if (sort_nthr == 0) // multithreaded auto choice: when N>>M, one thread is better! + sort_nthr = (10 * M > N) ? maxnthr : 1; // heuristic + if (sort_nthr == 1) + bin_sort_singlethread(sort_indices, M, kx, ky, kz, N1, N2, N3, bin_size_x, + bin_size_y, bin_size_z, sort_debug); + else // sort_nthr>1, user fixes # threads (>=2) + bin_sort_multithread(sort_indices, M, kx, ky, kz, N1, N2, N3, bin_size_x, + bin_size_y, bin_size_z, sort_debug, sort_nthr); + if (opts.debug) + printf("\tsorted (%d threads):\t%.3g s\n", sort_nthr, timer.elapsedsec()); + did_sort = 1; } else { -#pragma omp parallel for num_threads(maxnthr) schedule(static,1000000) - for (BIGINT i=0; i0) - nthr = opts.nthreads; // user override, now without limit + int ndims = ndims_from_Ns(N1, N2, N3); + BIGINT N = N1 * N2 * N3; // output array size + int ns = opts.nspread; // abbrev. 
for w, kernel width + int nthr = MY_OMP_GET_MAX_THREADS(); // guess # threads to use to spread + if (opts.nthreads > 0) nthr = opts.nthreads; // user override, now without limit #ifndef _OPENMP - nthr = 1; // single-threaded lib must override user + nthr = 1; // single-threaded lib must override user #endif if (opts.debug) - printf("\tspread %dD (M=%lld; N1=%lld,N2=%lld,N3=%lld), nthr=%d\n",ndims,(long long)M,(long long)N1,(long long)N2,(long long)N3,nthr); - + printf("\tspread %dD (M=%lld; N1=%lld,N2=%lld,N3=%lld), nthr=%d\n", ndims, + (long long)M, (long long)N1, (long long)N2, (long long)N3, nthr); + timer.start(); - for (BIGINT i=0; i<2*N; i++) // zero the output array. std::fill is no faster - data_uniform[i]=0.0; - if (opts.debug) printf("\tzero output array\t%.3g s\n",timer.elapsedsec()); - if (M==0) // no NU pts, we're done + for (BIGINT i = 0; i < 2 * N; i++) // zero the output array. std::fill is no faster + data_uniform[i] = 0.0; + if (opts.debug) printf("\tzero output array\t%.3g s\n", timer.elapsedsec()); + if (M == 0) // no NU pts, we're done return 0; - - int spread_single = (nthr==1) || (M*100opts.atomic_threshold) + if (opts.debug && nthr > opts.atomic_threshold) printf("\tnthr big: switching add_wrapped OMP from critical to atomic (!)\n"); - - std::vector brk(nb+1); // NU index breakpoints defining nb subproblems - for (int p=0;p<=nb;++p) - brk[p] = (BIGINT)(0.5 + M*p/(double)nb); - -#pragma omp parallel for num_threads(nthr) schedule(dynamic,1) // each is big - for (int isub=0; isub1) - ky0=(FLT*)malloc(sizeof(FLT)*M0); - if (N3>1) - kz0=(FLT*)malloc(sizeof(FLT)*M0); - FLT *dd0=(FLT*)malloc(sizeof(FLT)*M0*2); // complex strength data - for (BIGINT j=0; j1) ky0[j]= fold_rescale(ky[kk], N2); - if (N3>1) kz0[j]= fold_rescale(kz[kk], N3); - dd0[j*2]=data_nonuniform[kk*2]; // real part - dd0[j*2+1]=data_nonuniform[kk*2+1]; // imag part - } - // get the subgrid which will include padding by roughly nspread/2 - BIGINT offset1,offset2,offset3,size1,size2,size3; // get_subgrid sets - get_subgrid(offset1,offset2,offset3,size1,size2,size3,M0,kx0,ky0,kz0,ns,ndims); // sets offsets and sizes - if (opts.debug>1) { // verbose - if (ndims==1) - printf("\tsubgrid: off %lld\t siz %lld\t #NU %lld\n",(long long)offset1,(long long)size1,(long long)M0); - else if (ndims==2) - printf("\tsubgrid: off %lld,%lld\t siz %lld,%lld\t #NU %lld\n",(long long)offset1,(long long)offset2,(long long)size1,(long long)size2,(long long)M0); - else - printf("\tsubgrid: off %lld,%lld,%lld\t siz %lld,%lld,%lld\t #NU %lld\n",(long long)offset1,(long long)offset2,(long long)offset3,(long long)size1,(long long)size2,(long long)size3,(long long)M0); - } - // allocate output data for this subgrid - FLT *du0=(FLT*)malloc(sizeof(FLT)*2*size1*size2*size3); // complex - - // Spread to subgrid without need for bounds checking or wrapping - if (!(opts.flags & TF_OMIT_SPREADING)) { - if (ndims==1) - spread_subproblem_1d(offset1,size1,du0,M0,kx0,dd0,opts); - else if (ndims==2) - spread_subproblem_2d(offset1,offset2,size1,size2,du0,M0,kx0,ky0,dd0,opts); - else - spread_subproblem_3d(offset1,offset2,offset3,size1,size2,size3,du0,M0,kx0,ky0,kz0,dd0,opts); - } - - // do the adding of subgrid to output - if (!(opts.flags & TF_OMIT_WRITE_TO_GRID)) { - if (nthr > opts.atomic_threshold) // see above for debug reporting - add_wrapped_subgrid_thread_safe(offset1,offset2,offset3,size1,size2,size3,N1,N2,N3,data_uniform,du0); // R Blackwell's atomic version - else { + + std::vector brk(nb + 1); // NU index breakpoints defining nb 
subproblems + for (int p = 0; p <= nb; ++p) brk[p] = (BIGINT)(0.5 + M * p / (double)nb); + +#pragma omp parallel for num_threads(nthr) schedule(dynamic, 1) // each is big + for (int isub = 0; isub < nb; isub++) { // Main loop through the subproblems + BIGINT M0 = brk[isub + 1] - brk[isub]; // # NU pts in this subproblem + // copy the location and data vectors for the nonuniform points + FLT *kx0 = (FLT *)malloc(sizeof(FLT) * M0), *ky0 = NULL, *kz0 = NULL; + if (N2 > 1) ky0 = (FLT *)malloc(sizeof(FLT) * M0); + if (N3 > 1) kz0 = (FLT *)malloc(sizeof(FLT) * M0); + FLT *dd0 = (FLT *)malloc(sizeof(FLT) * M0 * 2); // complex strength data + for (BIGINT j = 0; j < M0; j++) { // todo: can avoid this copying? + BIGINT kk = sort_indices[j + brk[isub]]; // NU pt from subprob index list + kx0[j] = fold_rescale(kx[kk], N1); + if (N2 > 1) ky0[j] = fold_rescale(ky[kk], N2); + if (N3 > 1) kz0[j] = fold_rescale(kz[kk], N3); + dd0[j * 2] = data_nonuniform[kk * 2]; // real part + dd0[j * 2 + 1] = data_nonuniform[kk * 2 + 1]; // imag part + } + // get the subgrid which will include padding by roughly nspread/2 + BIGINT offset1, offset2, offset3, size1, size2, size3; // get_subgrid sets + get_subgrid(offset1, offset2, offset3, size1, size2, size3, M0, kx0, ky0, kz0, ns, + ndims); // sets offsets and sizes + if (opts.debug > 1) { // verbose + if (ndims == 1) + printf("\tsubgrid: off %lld\t siz %lld\t #NU %lld\n", (long long)offset1, + (long long)size1, (long long)M0); + else if (ndims == 2) + printf("\tsubgrid: off %lld,%lld\t siz %lld,%lld\t #NU %lld\n", + (long long)offset1, (long long)offset2, (long long)size1, + (long long)size2, (long long)M0); + else + printf("\tsubgrid: off %lld,%lld,%lld\t siz %lld,%lld,%lld\t #NU %lld\n", + (long long)offset1, (long long)offset2, (long long)offset3, + (long long)size1, (long long)size2, (long long)size3, (long long)M0); + } + // allocate output data for this subgrid + FLT *du0 = (FLT *)malloc(sizeof(FLT) * 2 * size1 * size2 * size3); // complex + + // Spread to subgrid without need for bounds checking or wrapping + if (!(opts.flags & TF_OMIT_SPREADING)) { + if (ndims == 1) + spread_subproblem_1d(offset1, size1, du0, M0, kx0, dd0, opts); + else if (ndims == 2) + spread_subproblem_2d(offset1, offset2, size1, size2, du0, M0, kx0, ky0, dd0, + opts); + else + spread_subproblem_3d(offset1, offset2, offset3, size1, size2, size3, du0, M0, + kx0, ky0, kz0, dd0, opts); + } + + // do the adding of subgrid to output + if (!(opts.flags & TF_OMIT_WRITE_TO_GRID)) { + if (nthr > opts.atomic_threshold) // see above for debug reporting + add_wrapped_subgrid_thread_safe(offset1, offset2, offset3, size1, size2, size3, + N1, N2, N3, data_uniform, du0); // R Blackwell's + // atomic + // version + else { #pragma omp critical - add_wrapped_subgrid(offset1,offset2,offset3,size1,size2,size3,N1,N2,N3,data_uniform,du0); - } + add_wrapped_subgrid(offset1, offset2, offset3, size1, size2, size3, N1, N2, N3, + data_uniform, du0); } + } - // free up stuff from this subprob... (that was malloc'ed by hand) - free(dd0); - free(du0); - free(kx0); - if (N2>1) free(ky0); - if (N3>1) free(kz0); - } // end main loop over subprobs - if (opts.debug) printf("\tt1 fancy spread: \t%.3g s (%d subprobs)\n",timer.elapsedsec(), nb); - } // end of choice of which t1 spread type to use - return 0; + // free up stuff from this subprob... 
(that was malloc'ed by hand) + free(dd0); + free(du0); + free(kx0); + if (N2 > 1) free(ky0); + if (N3 > 1) free(kz0); + } // end main loop over subprobs + if (opts.debug) + printf("\tt1 fancy spread: \t%.3g s (%d subprobs)\n", timer.elapsedsec(), nb); + } // end of choice of which t1 spread type to use + return 0; }; - // -------------------------------------------------------------------------- -int interpSorted(BIGINT* sort_indices,BIGINT N1, BIGINT N2, BIGINT N3, - FLT *data_uniform,BIGINT M, FLT *kx, FLT *ky, FLT *kz, - FLT *data_nonuniform, finufft_spread_opts opts, int did_sort) +int interpSorted(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, + BIGINT M, FLT *kx, FLT *ky, FLT *kz, FLT *data_nonuniform, + finufft_spread_opts opts, int did_sort) // Interpolate to NU pts in sorted order from a uniform grid. // See spreadinterp() for doc. { CNTime timer; - int ndims = ndims_from_Ns(N1,N2,N3); - int ns=opts.nspread; // abbrev. for w, kernel width - FLT ns2 = (FLT)ns/2; // half spread width, used as stencil shift - int nthr = MY_OMP_GET_MAX_THREADS(); // guess # threads to use to interp - if (opts.nthreads>0) - nthr = opts.nthreads; // user override, now without limit + int ndims = ndims_from_Ns(N1, N2, N3); + int ns = opts.nspread; // abbrev. for w, kernel width + FLT ns2 = (FLT)ns / 2; // half spread width, used as stencil shift + int nthr = MY_OMP_GET_MAX_THREADS(); // guess # threads to use to interp + if (opts.nthreads > 0) nthr = opts.nthreads; // user override, now without limit #ifndef _OPENMP - nthr = 1; // single-threaded lib must override user + nthr = 1; // single-threaded lib must override user #endif if (opts.debug) - printf("\tinterp %dD (M=%lld; N1=%lld,N2=%lld,N3=%lld), nthr=%d\n",ndims,(long long)M,(long long)N1,(long long)N2,(long long)N3,nthr); + printf("\tinterp %dD (M=%lld; N1=%lld,N2=%lld,N3=%lld), nthr=%d\n", ndims, + (long long)M, (long long)N1, (long long)N2, (long long)N3, nthr); - timer.start(); + timer.start(); #pragma omp parallel num_threads(nthr) { -#define CHUNKSIZE 16 // Chunks of Type 2 targets (Ludvig found by expt) +#define CHUNKSIZE 16 // Chunks of Type 2 targets (Ludvig found by expt) BIGINT jlist[CHUNKSIZE]; FLT xjlist[CHUNKSIZE], yjlist[CHUNKSIZE], zjlist[CHUNKSIZE]; - FLT outbuf[2*CHUNKSIZE]; + FLT outbuf[2 * CHUNKSIZE]; // Kernels: static alloc is faster, so we do it for up to 3D... - FLT kernel_args[3*MAX_NSPREAD]; - FLT kernel_values[3*MAX_NSPREAD]; + FLT kernel_args[3 * MAX_NSPREAD]; + FLT kernel_values[3 * MAX_NSPREAD]; FLT *ker1 = kernel_values; FLT *ker2 = kernel_values + ns; - FLT *ker3 = kernel_values + 2*ns; + FLT *ker3 = kernel_values + 2 * ns; // Loop over interpolation chunks -#pragma omp for schedule (dynamic,1000) // assign threads to NU targ pts: - for (BIGINT i=0; i M) ? M-i : CHUNKSIZE; - for (int ibuf=0; ibuf=2) - yjlist[ibuf] = fold_rescale(ky[j], N2); - if(ndims == 3) - zjlist[ibuf] = fold_rescale(kz[j], N3); - } - - // Loop over targets in chunk - for (int ibuf=0; ibuf 1) ? yjlist[ibuf] : 0; - FLT zj = (ndims > 2) ? zjlist[ibuf] : 0; - - FLT *target = outbuf+2*ibuf; - - // coords (x,y,z), spread block corner index (i1,i2,i3) of current NU targ - BIGINT i1=(BIGINT)std::ceil(xj-ns2); // leftmost grid index - BIGINT i2= (ndims > 1) ? (BIGINT)std::ceil(yj-ns2) : 0; // min y grid index - BIGINT i3= (ndims > 2) ? (BIGINT)std::ceil(zj-ns2) : 0; // min z grid index - - FLT x1=(FLT)i1-xj; // shift of ker center, in [-w/2,-w/2+1] - FLT x2= (ndims > 1) ? (FLT)i2-yj : 0 ; - FLT x3= (ndims > 2)? 
(FLT)i3-zj : 0; - - // eval kernel values patch and use to interpolate from uniform data... - if (!(opts.flags & TF_OMIT_SPREADING)) { +#pragma omp for schedule(dynamic, 1000) // assign threads to NU targ pts: + for (BIGINT i = 0; i < M; i += CHUNKSIZE) // main loop over NU targs, interp each + // from U + { + // Setup buffers for this chunk + int bufsize = (i + CHUNKSIZE > M) ? M - i : CHUNKSIZE; + for (int ibuf = 0; ibuf < bufsize; ibuf++) { + BIGINT j = sort_indices[i + ibuf]; + jlist[ibuf] = j; + xjlist[ibuf] = fold_rescale(kx[j], N1); + if (ndims >= 2) yjlist[ibuf] = fold_rescale(ky[j], N2); + if (ndims == 3) zjlist[ibuf] = fold_rescale(kz[j], N3); + } + + // Loop over targets in chunk + for (int ibuf = 0; ibuf < bufsize; ibuf++) { + FLT xj = xjlist[ibuf]; + FLT yj = (ndims > 1) ? yjlist[ibuf] : 0; + FLT zj = (ndims > 2) ? zjlist[ibuf] : 0; - if (opts.kerevalmeth==0) { // choose eval method - set_kernel_args(kernel_args, x1, opts); - if(ndims > 1) set_kernel_args(kernel_args+ns, x2, opts); - if(ndims > 2) set_kernel_args(kernel_args+2*ns, x3, opts); - - evaluate_kernel_vector(kernel_values, kernel_args, opts, ndims*ns); - } - - else{ - // due to ns being padded up to next multiple of 4 in the eval_kernel_vec_Horner - // and writing zeros out to this padded size, these must occur in the order x,y,z... - eval_kernel_vec_Horner(ker1,x1,ns,opts); - if (ndims > 1) eval_kernel_vec_Horner(ker2,x2,ns,opts); - if (ndims > 2) eval_kernel_vec_Horner(ker3,x3,ns,opts); - } - - switch(ndims){ - case 1: - interp_line(target,data_uniform,ker1,i1,N1,ns); - break; - case 2: - interp_square(target,data_uniform,ker1,ker2,i1,i2,N1,N2,ns); - break; - case 3: - interp_cube(target,data_uniform,ker1,ker2,ker3,i1,i2,i3,N1,N2,N3,ns); - break; - default: //can't get here - break; - - } + FLT *target = outbuf + 2 * ibuf; + + // coords (x,y,z), spread block corner index (i1,i2,i3) of current NU targ + BIGINT i1 = (BIGINT)std::ceil(xj - ns2); // leftmost grid index + BIGINT i2 = (ndims > 1) ? (BIGINT)std::ceil(yj - ns2) : 0; // min y grid + // index + BIGINT i3 = (ndims > 2) ? (BIGINT)std::ceil(zj - ns2) : 0; // min z grid + // index + + FLT x1 = (FLT)i1 - xj; // shift of ker center, in [-w/2,-w/2+1] + FLT x2 = (ndims > 1) ? (FLT)i2 - yj : 0; + FLT x3 = (ndims > 2) ? (FLT)i3 - zj : 0; + + // eval kernel values patch and use to interpolate from uniform data... + if (!(opts.flags & TF_OMIT_SPREADING)) { + + if (opts.kerevalmeth == 0) { // choose eval method + set_kernel_args(kernel_args, x1, opts); + if (ndims > 1) set_kernel_args(kernel_args + ns, x2, opts); + if (ndims > 2) set_kernel_args(kernel_args + 2 * ns, x3, opts); + + evaluate_kernel_vector(kernel_values, kernel_args, opts, ndims * ns); + } + + else { + // due to ns being padded up to next multiple of 4 in the + // eval_kernel_vec_Horner and writing zeros out to this padded + // size, these must occur in the order x,y,z... 
+ eval_kernel_vec_Horner(ker1, x1, ns, opts); + if (ndims > 1) eval_kernel_vec_Horner(ker2, x2, ns, opts); + if (ndims > 2) eval_kernel_vec_Horner(ker3, x3, ns, opts); + } + + switch (ndims) { + case 1: + interp_line(target, data_uniform, ker1, i1, N1, ns); + break; + case 2: + interp_square(target, data_uniform, ker1, ker2, i1, i2, N1, N2, ns); + break; + case 3: + interp_cube(target, data_uniform, ker1, ker2, ker3, i1, i2, i3, N1, N2, N3, + ns); + break; + default: // can't get here + break; + } + } + } // end loop over targets in chunk + + // Copy result buffer to output array + for (int ibuf = 0; ibuf < bufsize; ibuf++) { + BIGINT j = jlist[ibuf]; + data_nonuniform[2 * j] = outbuf[2 * ibuf]; + data_nonuniform[2 * j + 1] = outbuf[2 * ibuf + 1]; } - } // end loop over targets in chunk - - // Copy result buffer to output array - for (int ibuf=0; ibuf4.0) - fprintf(stderr,"FINUFFT setup_spreader warning: upsampfac=%.3g way too large to be beneficial.\n",upsampfac); + if (showwarn && upsampfac > 4.0) + fprintf(stderr, + "FINUFFT setup_spreader warning: upsampfac=%.3g way too large to be " + "beneficial.\n", + upsampfac); } - + // write out default finufft_spread_opts (some overridden in setup_spreader_for_nufft) - opts.spread_direction = 0; // user should always set to 1 or 2 as desired - opts.sort = 2; // 2:auto-choice - opts.kerpad = 0; // affects only evaluate_kernel_vector - opts.kerevalmeth = kerevalmeth; - opts.upsampfac = upsampfac; - opts.nthreads = 0; // all avail - opts.sort_threads = 0; // 0:auto-choice + opts.spread_direction = 0; // user should always set to 1 or 2 as desired + opts.sort = 2; // 2:auto-choice + opts.kerpad = 0; // affects only evaluate_kernel_vector + opts.kerevalmeth = kerevalmeth; + opts.upsampfac = upsampfac; + opts.nthreads = 0; // all avail + opts.sort_threads = 0; // 0:auto-choice // heuristic dir=1 chunking for nthr>>1, typical for intel i7 and skylake... - opts.max_subproblem_size = (dim==1) ? 10000 : 100000; - opts.flags = 0; // 0:no timing flags (>0 for experts only) - opts.debug = 0; // 0:no debug output + opts.max_subproblem_size = (dim == 1) ? 10000 : 100000; + opts.flags = 0; // 0:no timing flags (>0 for experts only) + opts.debug = 0; // 0:no debug output // heuristic nthr above which switch OMP critical to atomic (add_wrapped...): - opts.atomic_threshold = 10; // R Blackwell's value + opts.atomic_threshold = 10; // R Blackwell's value - int ns, ier = 0; // Set kernel width w (aka ns, nspread) then copy to opts... - if (epsMAX_NSPREAD) { // clip to fit allocated arrays, Horner rules + if (upsampfac == 2.0) // standard sigma (see SISC paper) + ns = std::ceil(-log10(eps / (FLT)10.0)); // 1 digit per power of 10 + else // custom sigma + ns = std::ceil(-log(eps) / (PI * sqrt(1.0 - 1.0 / upsampfac))); // formula, gam=1 + ns = max(2, ns); // (we don't have ns=1 version yet) + if (ns > MAX_NSPREAD) { // clip to fit allocated arrays, Horner rules if (showwarn) - fprintf(stderr,"%s warning: at upsampfac=%.3g, tol=%.3g would need kernel width ns=%d; clipping to max %d.\n",__func__, - upsampfac,(double)eps,ns,MAX_NSPREAD); - ns = MAX_NSPREAD; + fprintf(stderr, + "%s warning: at upsampfac=%.3g, tol=%.3g would need kernel width " + "ns=%d; clipping to max %d.\n", + __func__, upsampfac, (double)eps, ns, MAX_NSPREAD); + ns = MAX_NSPREAD; ier = FINUFFT_WARN_EPS_TOO_SMALL; } opts.nspread = ns; // setup for reference kernel eval (via formula): select beta width param... 
// (even when kerevalmeth=1, this ker eval needed for FTs in onedim_*_kernel) - opts.ES_halfwidth=(double)ns/2; // constants to help (see below routines) - opts.ES_c = 4.0/(double)(ns*ns); - double betaoverns = 2.30; // gives decent betas for default sigma=2.0 - if (ns==2) betaoverns = 2.20; // some small-width tweaks... - if (ns==3) betaoverns = 2.26; - if (ns==4) betaoverns = 2.38; - if (upsampfac!=2.0) { // again, override beta for custom sigma - FLT gamma=0.97; // must match devel/gen_all_horner_C_code.m ! - betaoverns = gamma*PI*(1.0-1.0/(2*upsampfac)); // formula based on cutoff + opts.ES_halfwidth = (double)ns / 2; // constants to help (see below routines) + opts.ES_c = 4.0 / (double)(ns * ns); + double betaoverns = 2.30; // gives decent betas for default sigma=2.0 + if (ns == 2) betaoverns = 2.20; // some small-width tweaks... + if (ns == 3) betaoverns = 2.26; + if (ns == 4) betaoverns = 2.38; + if (upsampfac != 2.0) { // again, override beta for custom sigma + FLT gamma = 0.97; // must match devel/gen_all_horner_C_code.m ! + betaoverns = gamma * PI * (1.0 - 1.0 / (2 * upsampfac)); // formula based on + // cutoff } - opts.ES_beta = betaoverns * ns; // set the kernel beta parameter + opts.ES_beta = betaoverns * ns; // set the kernel beta parameter if (debug) - printf("%s (kerevalmeth=%d) eps=%.3g sigma=%.3g: chose ns=%d beta=%.3g\n",__func__,kerevalmeth,(double)eps,upsampfac,ns,opts.ES_beta); - + printf("%s (kerevalmeth=%d) eps=%.3g sigma=%.3g: chose ns=%d beta=%.3g\n", __func__, + kerevalmeth, (double)eps, upsampfac, ns, opts.ES_beta); + return ier; } @@ -603,23 +626,23 @@ FLT evaluate_kernel(FLT x, const finufft_spread_opts &opts) This is the "reference implementation", used by eg finufft/onedim_* 2/17/17 */ { - if (abs(x)>=(FLT)opts.ES_halfwidth) + if (abs(x) >= (FLT)opts.ES_halfwidth) // if spreading/FT careful, shouldn't need this if, but causes no speed hit return 0.0; else - return exp((FLT)opts.ES_beta * sqrt((FLT)1.0 - (FLT)opts.ES_c*x*x)); + return exp((FLT)opts.ES_beta * sqrt((FLT)1.0 - (FLT)opts.ES_c * x * x)); } -static inline void set_kernel_args(FLT *args, FLT x, const finufft_spread_opts& opts) +static inline void set_kernel_args(FLT *args, FLT x, const finufft_spread_opts &opts) // Fills vector args[] with kernel arguments x, x+1, ..., x+ns-1. // needed for the vectorized kernel eval of Ludvig af K. { - int ns=opts.nspread; - for (int i=0; i=(FLT)opts.ES_halfwidth) ker[i] = 0.0; + if (abs(args[i]) >= (FLT)opts.ES_halfwidth) ker[i] = 0.0; } static inline void eval_kernel_vec_Horner(FLT *ker, const FLT x, const int w, - const finufft_spread_opts &opts) + const finufft_spread_opts &opts) /* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at x_j = x + j, for j=0,..,w-1. Thus x in [-w/2,-w/2+1]. w is aka ns. This is the current evaluation method, since it's faster (except i7 w=16). Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ { if (!(opts.flags & TF_OMIT_EVALUATE_KERNEL)) { - FLT z = (FLT)2.0*x + w - (FLT)1.0; // scale so local grid offset z in [-1,1] + FLT z = (FLT)2.0 * x + w - (FLT)1.0; // scale so local grid offset z in [-1,1] // insert the auto-generated code which expects z, w args, writes to ker... 
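// For reference (a restatement of the code above, no new behavior): with
// w = nspread, the kernel that evaluate_kernel() implements is
//   phi(x) = exp(beta * sqrt(1 - (2x/w)^2))  for |x| < w/2,  and 0 otherwise,
// since ES_c = 4/w^2 and ES_halfwidth = w/2. setup_spreader() picks the width
// from the tolerance: w = ceil(log10(10/eps)) at the default sigma = 2 (about
// one digit of accuracy per unit of w), or w = ceil(-log(eps)/(pi*sqrt(1-1/sigma)))
// for custom sigma, clipped to [2, MAX_NSPREAD]; beta = betaoverns*w, with
// betaoverns near 2.3 at sigma = 2 (small tweaks for w = 2,3,4) and
// gamma*pi*(1 - 1/(2*sigma)), gamma = 0.97, otherwise.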
- if (opts.upsampfac==2.0) { // floating point equality is fine here + if (opts.upsampfac == 2.0) { // floating point equality is fine here #include "ker_horner_allw_loop.c" - } else if (opts.upsampfac==1.25) { + } else if (opts.upsampfac == 1.25) { #include "ker_lowupsampfac_horner_allw_loop.c" } else - fprintf(stderr,"%s: unknown upsampfac, failed!\n",__func__); + fprintf(stderr, "%s: unknown upsampfac, failed!\n", __func__); } } -void interp_line(FLT *target,FLT *du, FLT *ker,BIGINT i1,BIGINT N1,int ns) +void interp_line(FLT *target, FLT *du, FLT *ker, BIGINT i1, BIGINT N1, int ns) /* 1D interpolate complex values from size-ns block of the du (uniform grid data) array to a single complex output value "target", using as weights the 1d kernel evaluation list ker1. @@ -693,36 +716,36 @@ void interp_line(FLT *target,FLT *du, FLT *ker,BIGINT i1,BIGINT N1,int ns) */ { FLT out[] = {0.0, 0.0}; - BIGINT j = i1; - if (i1<0) { // wraps at left - j+=N1; - for (int dx=0; dx<-i1; ++dx) { - out[0] += du[2*j]*ker[dx]; - out[1] += du[2*j+1]*ker[dx]; + BIGINT j = i1; + if (i1 < 0) { // wraps at left + j += N1; + for (int dx = 0; dx < -i1; ++dx) { + out[0] += du[2 * j] * ker[dx]; + out[1] += du[2 * j + 1] * ker[dx]; ++j; } - j-=N1; - for (int dx=-i1; dx=N1) { // wraps at right - for (int dx=0; dx= N1) { // wraps at right + for (int dx = 0; dx < N1 - i1; ++dx) { + out[0] += du[2 * j] * ker[dx]; + out[1] += du[2 * j + 1] * ker[dx]; ++j; } - j-=N1; - for (int dx=N1-i1; dx=0 && i1+ns<=N1 && i2>=0 && i2+ns<=N2) { // no wrapping: avoid ptrs - FLT line[2*MAX_NSPREAD]; // store a horiz line (interleaved real,imag) + if (i1 >= 0 && i1 + ns <= N1 && i2 >= 0 && i2 + ns <= N2) { // no wrapping: avoid ptrs + FLT line[2 * MAX_NSPREAD]; // store a horiz line (interleaved real,imag) // block for first y line, to avoid explicitly initializing line with zeros { - const FLT *lptr = du + 2*(N1*i2 + i1); // ptr to horiz line start in du - for (int l=0; l<2*ns; l++) { // l is like dx but for ns interleaved - line[l] = ker2[0]*lptr[l]; + const FLT *lptr = du + 2 * (N1 * i2 + i1); // ptr to horiz line start in du + for (int l = 0; l < 2 * ns; l++) { // l is like dx but for ns interleaved + line[l] = ker2[0] * lptr[l]; } } // add remaining const-y lines to the line (expensive inner loop) - for (int dy=1; dy=N1) x-=N1; + BIGINT j1[MAX_NSPREAD], j2[MAX_NSPREAD]; // 1d ptr lists + BIGINT x = i1, y = i2; // initialize coords + for (int d = 0; d < ns; d++) { // set up ptr lists + if (x < 0) x += N1; + if (x >= N1) x -= N1; j1[d] = x++; - if (y<0) y+=N2; - if (y>=N2) y-=N2; + if (y < 0) y += N2; + if (y >= N2) y -= N2; j2[d] = y++; } - for (int dy=0; dy=0 && i1+ns<=N1 && i2>=0 && i2+ns<=N2 && i3>=0 && i3+ns<=N3) { + FLT out[] = {0.0, 0.0}; + if (i1 >= 0 && i1 + ns <= N1 && i2 >= 0 && i2 + ns <= N2 && i3 >= 0 && i3 + ns <= N3) { // no wrapping: avoid ptrs (by far the most common case) - FLT line[2*MAX_NSPREAD]; // store a horiz line (interleaved real,imag) + FLT line[2 * MAX_NSPREAD]; // store a horiz line (interleaved real,imag) // initialize line with zeros; hard to avoid here, but overhead small in 3D - for (int l=0; l<2*ns; l++) { + for (int l = 0; l < 2 * ns; l++) { line[l] = 0; } // co-add y and z contributions to line in x; do not apply x kernel yet // This is expensive innermost loop - for (int dz=0; dz=N1) x-=N1; + BIGINT j1[MAX_NSPREAD], j2[MAX_NSPREAD], j3[MAX_NSPREAD]; // 1d ptr lists + BIGINT x = i1, y = i2, z = i3; // initialize coords + for (int d = 0; d < ns; d++) { // set up ptr lists + if (x < 0) x += N1; + if (x >= 
N1) x -= N1; j1[d] = x++; - if (y<0) y+=N2; - if (y>=N2) y-=N2; + if (y < 0) y += N2; + if (y >= N2) y -= N2; j2[d] = y++; - if (z<0) z+=N3; - if (z>=N3) z-=N3; + if (z < 0) z += N3; + if (z >= N3) z -= N3; j3[d] = z++; } - for (int dz=0; dzO(1) then can cause O(1) errors in x1, hence ppoly // kernel evaluation will fall outside their designed domains, >>1 errors. // This can only happen if the overall error would be O(1) anyway. Clip x1?? - if (x1<-ns2) x1=-ns2; - if (x1>-ns2+1) x1=-ns2+1; // *** - if (opts.kerevalmeth==0) { // faster Horner poly method + if (x1 < -ns2) x1 = -ns2; + if (x1 > -ns2 + 1) x1 = -ns2 + 1; // *** + if (opts.kerevalmeth == 0) { // faster Horner poly method set_kernel_args(kernel_args, x1, opts); evaluate_kernel_vector(ker, kernel_args, opts, ns); } else - eval_kernel_vec_Horner(ker,x1,ns,opts); - BIGINT j = i1-off1; // offset rel to subgrid, starts the output indices + eval_kernel_vec_Horner(ker, x1, ns, opts); + BIGINT j = i1 - off1; // offset rel to subgrid, starts the output indices // critical inner loop: - for (int dx=0; dx o2(size2), o3(size3); - BIGINT y=offset2, z=offset3; // fill wrapped ptr lists in slower dims y,z... - for (int i=0; i=N2) y-=N2; + BIGINT y = offset2, z = offset3; // fill wrapped ptr lists in slower dims y,z... + for (int i = 0; i < size2; ++i) { + if (y < 0) y += N2; + if (y >= N2) y -= N2; o2[i] = y++; } - for (int i=0; i=N3) z-=N3; + for (int i = 0; i < size3; ++i) { + if (z < 0) z += N3; + if (z >= N3) z -= N3; o3[i] = z++; } - BIGINT nlo = (offset1<0) ? -offset1 : 0; // # wrapping below in x - BIGINT nhi = (offset1+size1>N1) ? offset1+size1-N1 : 0; // " above in x + BIGINT nlo = (offset1 < 0) ? -offset1 : 0; // # wrapping below in x + BIGINT nhi = (offset1 + size1 > N1) ? offset1 + size1 - N1 : 0; // " above in x // this triple loop works in all dims - for (int dz=0; dz o2(size2), o3(size3); - BIGINT y=offset2, z=offset3; // fill wrapped ptr lists in slower dims y,z... - for (int i=0; i=N2) y-=N2; + BIGINT y = offset2, z = offset3; // fill wrapped ptr lists in slower dims y,z... + for (int i = 0; i < size2; ++i) { + if (y < 0) y += N2; + if (y >= N2) y -= N2; o2[i] = y++; } - for (int i=0; i=N3) z-=N3; + for (int i = 0; i < size3; ++i) { + if (z < 0) z += N3; + if (z >= N3) z -= N3; o3[i] = z++; } - BIGINT nlo = (offset1<0) ? -offset1 : 0; // # wrapping below in x - BIGINT nhi = (offset1+size1>N1) ? offset1+size1-N1 : 0; // " above in x + BIGINT nlo = (offset1 < 0) ? -offset1 : 0; // # wrapping below in x + BIGINT nhi = (offset1 + size1 > N1) ? offset1 + size1 - N1 : 0; // " above in x // this triple loop works in all dims - for (int dz=0; dz1), iskz=(N3>1); // ky,kz avail? (cannot access if not) + bool isky = (N2 > 1), iskz = (N3 > 1); // ky,kz avail? (cannot access if not) // here the +1 is needed to allow round-off error causing i1=N1/bin_size_x, // for kx near +pi, ie foldrescale gives N1 (exact arith would be 0 to N1-1). // Note that round-off near kx=-pi stably rounds negative to i1=0. - BIGINT nbins1=N1/bin_size_x+1, nbins2, nbins3; - nbins2 = isky ? N2/bin_size_y+1 : 1; - nbins3 = iskz ? N3/bin_size_z+1 : 1; - BIGINT nbins = nbins1*nbins2*nbins3; + BIGINT nbins1 = N1 / bin_size_x + 1, nbins2, nbins3; + nbins2 = isky ? N2 / bin_size_y + 1 : 1; + nbins3 = iskz ? 
N3 / bin_size_z + 1 : 1; + BIGINT nbins = nbins1 * nbins2 * nbins3; - std::vector counts(nbins,0); // count how many pts in each bin - for (BIGINT i=0; i counts(nbins, 0); // count how many pts in each bin + for (BIGINT i = 0; i < M; i++) { // find the bin index in however many dims are needed - BIGINT i1= fold_rescale(kx[i], N1) / bin_size_x, i2=0, i3=0; + BIGINT i1 = fold_rescale(kx[i], N1) / bin_size_x, i2 = 0, i3 = 0; if (isky) i2 = fold_rescale(ky[i], N2) / bin_size_y; if (iskz) i3 = fold_rescale(kz[i], N3) / bin_size_z; - BIGINT bin = i1+nbins1*(i2+nbins2*i3); + BIGINT bin = i1 + nbins1 * (i2 + nbins2 * i3); counts[bin]++; } // compute the offsets directly in the counts array (no offset array) - BIGINT current_offset=0; - for (BIGINT i=0; i1), iskz=(N3>1); // ky,kz avail? (cannot access if not) - BIGINT nbins1=N1/bin_size_x+1, nbins2, nbins3; // see above note on why +1 - nbins2 = isky ? N2/bin_size_y+1 : 1; - nbins3 = iskz ? N3/bin_size_z+1 : 1; - BIGINT nbins = nbins1*nbins2*nbins3; - if (nthr==0) // should never happen in spreadinterp use - fprintf(stderr,"[%s] nthr (%d) must be positive!\n",__func__,nthr); - int nt = min(M,(BIGINT)nthr); // handle case of less points than threads - std::vector brk(nt+1); // list of start NU pt indices per thread + bool isky = (N2 > 1), iskz = (N3 > 1); // ky,kz avail? (cannot access if not) + BIGINT nbins1 = N1 / bin_size_x + 1, nbins2, nbins3; // see above note on why +1 + nbins2 = isky ? N2 / bin_size_y + 1 : 1; + nbins3 = iskz ? N3 / bin_size_z + 1 : 1; + BIGINT nbins = nbins1 * nbins2 * nbins3; + if (nthr == 0) // should never happen in spreadinterp use + fprintf(stderr, "[%s] nthr (%d) must be positive!\n", __func__, nthr); + int nt = min(M, (BIGINT)nthr); // handle case of less points than threads + std::vector brk(nt + 1); // list of start NU pt indices per thread // distribute the NU pts to threads once & for all... - for (int t=0; t<=nt; ++t) - brk[t] = (BIGINT)(0.5 + M*t/(double)nt); // start index for t'th chunk + for (int t = 0; t <= nt; ++t) + brk[t] = (BIGINT)(0.5 + M * t / (double)nt); // start index for t'th chunk // set up 2d array (nthreads * nbins), just its pointers for now // (sub-vectors will be initialized later) - std::vector< std::vector > counts(nt); - + std::vector> counts(nt); + #pragma omp parallel num_threads(nt) - { // parallel binning to each thread's count. Block done once per thread - int t = MY_OMP_GET_THREAD_NUM(); // (we assume all nt threads created) - auto &my_counts(counts[t]); // name for counts[t] - my_counts.resize(nbins,0); // allocate counts[t], now in parallel region - for (BIGINT i=brk[t]; i1) { - FLT min_ky,max_ky; // 2nd (y) dimension: get min/max of nonuniform points - arrayrange(M,ky,&min_ky,&max_ky); - offset2 = (BIGINT)std::ceil(min_ky-ns2); - size2 = (BIGINT)std::ceil(max_ky-ns2) - offset2 + ns; + FLT ns2 = (FLT)ns / 2; + FLT min_kx, max_kx; // 1st (x) dimension: get min/max of nonuniform points + arrayrange(M, kx, &min_kx, &max_kx); + offset1 = (BIGINT)std::ceil(min_kx - ns2); // min index touched by kernel + size1 = (BIGINT)std::ceil(max_kx - ns2) - offset1 + ns; // int(ceil) first! 
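// In words (restating the two lines above): offset1 = ceil(min_kx - w/2) is the
// lowest grid index a width-w kernel centered at any NU point of this subproblem
// can touch, and size1 = ceil(max_kx - w/2) - offset1 + w extends to the highest
// one, so the (unwrapped) subgrid covers every point the spreader may write to.
// The same pattern repeats for y and z below.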
+ if (ndims > 1) { + FLT min_ky, max_ky; // 2nd (y) dimension: get min/max of nonuniform points + arrayrange(M, ky, &min_ky, &max_ky); + offset2 = (BIGINT)std::ceil(min_ky - ns2); + size2 = (BIGINT)std::ceil(max_ky - ns2) - offset2 + ns; } else { offset2 = 0; - size2 = 1; + size2 = 1; } - if (ndims>2) { - FLT min_kz,max_kz; // 3rd (z) dimension: get min/max of nonuniform points - arrayrange(M,kz,&min_kz,&max_kz); - offset3 = (BIGINT)std::ceil(min_kz-ns2); - size3 = (BIGINT)std::ceil(max_kz-ns2) - offset3 + ns; + if (ndims > 2) { + FLT min_kz, max_kz; // 3rd (z) dimension: get min/max of nonuniform points + arrayrange(M, kz, &min_kz, &max_kz); + offset3 = (BIGINT)std::ceil(min_kz - ns2); + size3 = (BIGINT)std::ceil(max_kz - ns2) - offset3 + ns; } else { offset3 = 0; - size3 = 1; + size3 = 1; } } /* local NU coord fold+rescale macro: does the following affine transform to x: when p=true: (x+PI) mod PI each to [0,N) otherwise, x mod N each to [0,N) Note: folding big numbers can cause numerical inaccuracies - Martin Reinecke, 8.5.2024 used floor to speedup the function and removed the range limitation - Marco Barbone, 8.5.2024 Changed it from a Macro to an inline function + Martin Reinecke, 8.5.2024 used floor to speedup the function and removed the range + limitation Marco Barbone, 8.5.2024 Changed it from a Macro to an inline function */ FINUFFT_ALWAYS_INLINE FLT fold_rescale(const FLT x, const BIGINT N) noexcept { static constexpr const FLT x2pi = FLT(M_1_2PI); - const FLT result = x * x2pi + FLT(0.5); - return (result-floor(result)) * FLT(N); + const FLT result = x * x2pi + FLT(0.5); + return (result - floor(result)) * FLT(N); } -} // namespace -} // namespace +} // namespace spreadinterp +} // namespace finufft diff --git a/src/utils.cpp b/src/utils.cpp index 92f4035eb..8df6ed665 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -7,80 +7,80 @@ #include "finufft/defs.h" namespace finufft { - namespace utils { +namespace utils { // ------------ complex array utils --------------------------------- -FLT relerrtwonorm(BIGINT n, CPX* a, CPX* b) +FLT relerrtwonorm(BIGINT n, CPX *a, CPX *b) // ||a-b||_2 / ||a||_2 { FLT err = 0.0, nrm = 0.0; - for (BIGINT m=0; mnrm) nrm = aa; + for (BIGINT m = 0; m < n; ++m) { + FLT aa = real(conj(a[m]) * a[m]); + if (aa > nrm) nrm = aa; } return sqrt(nrm); } // ------------ real array utils --------------------------------- -void arrayrange(BIGINT n, FLT* a, FLT *lo, FLT *hi) +void arrayrange(BIGINT n, FLT *a, FLT *lo, FLT *hi) // With a a length-n array, writes out min(a) to lo and max(a) to hi, // so that all a values lie in [lo,hi]. // If n==0, lo and hi are not finite. { - *lo = INFINITY; *hi = -INFINITY; - for (BIGINT m=0; m*hi) *hi = a[m]; + *lo = INFINITY; + *hi = -INFINITY; + for (BIGINT m = 0; m < n; ++m) { + if (a[m] < *lo) *lo = a[m]; + if (a[m] > *hi) *hi = a[m]; } } -void arraywidcen(BIGINT n, FLT* a, FLT *w, FLT *c) +void arraywidcen(BIGINT n, FLT *a, FLT *w, FLT *c) // Writes out w = half-width and c = center of an interval enclosing all a[n]'s // Only chooses a nonzero center if this increases w by less than fraction // ARRAYWIDCEN_GROWFRAC defined in defs.h. // This prevents rephasings which don't grow nf by much. 6/8/17 // If n==0, w and c are not finite. 
{ - FLT lo,hi; - arrayrange(n,a,&lo,&hi); - *w = (hi-lo)/2; - *c = (hi+lo)/2; - if (std::abs(*c) -#include "finufft/utils_precindep.h" #include "finufft/defs.h" +#include "finufft/utils_precindep.h" using namespace std; namespace finufft { - namespace utils { +namespace utils { BIGINT next235even(BIGINT n) // finds even integer not less than n, with prime factors no larger than 5 // (ie, "smooth"). Adapted from fortran in hellskitchen. Barnett 2/9/17 // changed INT64 type 3/28/17. Runtime is around n*1e-11 sec for big n. { - if (n<=2) return 2; - if (n%2 == 1) n+=1; // even - BIGINT nplus = n-2; // to cancel out the +=2 at start of loop - BIGINT numdiv = 2; // a dummy that is >1 - while (numdiv>1) { - nplus += 2; // stays even + if (n <= 2) return 2; + if (n % 2 == 1) n += 1; // even + BIGINT nplus = n - 2; // to cancel out the +=2 at start of loop + BIGINT numdiv = 2; // a dummy that is >1 + while (numdiv > 1) { + nplus += 2; // stays even numdiv = nplus; - while (numdiv%2 == 0) numdiv /= 2; // remove all factors of 2,3,5... - while (numdiv%3 == 0) numdiv /= 3; - while (numdiv%5 == 0) numdiv /= 5; + while (numdiv % 2 == 0) numdiv /= 2; // remove all factors of 2,3,5... + while (numdiv % 3 == 0) numdiv /= 3; + while (numdiv % 5 == 0) numdiv /= 5; } return nplus; } // ----------------------- helpers for timing (always stay double prec) ------ - -void CNTime::start() -{ + +void CNTime::start() { initial = std::chrono::duration_cast( - std::chrono::steady_clock::now().time_since_epoch()).count()*1e-6; + std::chrono::steady_clock::now().time_since_epoch()) + .count() * + 1e-6; } double CNTime::restart() @@ -51,12 +52,12 @@ double CNTime::elapsedsec() // returns answers as double, in seconds, to microsec accuracy. Barnett 5/22/18 { std::uint64_t now = std::chrono::duration_cast( - std::chrono::steady_clock::now().time_since_epoch()).count(); - const double nowsec = now*1e-6; + std::chrono::steady_clock::now().time_since_epoch()) + .count(); + const double nowsec = now * 1e-6; return nowsec - initial; } - // -------------------------- openmp helpers ------------------------------- int get_num_threads_parallel_block() // return how many threads an omp parallel block would use. @@ -72,19 +73,18 @@ int get_num_threads_parallel_block() return nth_used; } - // ---------- thread-safe rand number generator for Windows platform --------- // (note this is used by macros in defs.h, and supplied in linux/macosx) #ifdef _WIN32 int rand_r(unsigned int *seedp) // Libin Lu, 6/18/20 { - std::random_device rd; - std::default_random_engine generator(rd()); - std::uniform_int_distribution distribution(0,RAND_MAX); - return distribution(generator); + std::random_device rd; + std::default_random_engine generator(rd()); + std::uniform_int_distribution distribution(0, RAND_MAX); + return distribution(generator); } #endif - } // namespace -} // namespace +} // namespace utils +} // namespace finufft diff --git a/test/basicpassfail.cpp b/test/basicpassfail.cpp index c3648d878..8d2a78633 100644 --- a/test/basicpassfail.cpp +++ b/test/basicpassfail.cpp @@ -6,40 +6,40 @@ // Simplified from Amit Moscovitz and example1d1. Barnett 11/1/18. // Using vectors and default opts, 2/29/20; dual-prec lib 7/3/20. -int main() -{ - BIGINT M = 1e3, N = 1e3; // defaults: M = # srcs, N = # modes out - double tol = 1e-5; // req tol, covers both single & double prec cases - int isign = +1; // exponential sign for NUFFT - static const CPX I = CPX(0.0,1.0); // imaginary unit. 
Note: avoid (CPX) cast - std::vector F(N); // alloc output mode coeffs +int main() { + BIGINT M = 1e3, N = 1e3; // defaults: M = # srcs, N = # modes out + double tol = 1e-5; // req tol, covers both single & double prec cases + int isign = +1; // exponential sign for NUFFT + static const CPX I = CPX(0.0, 1.0); // imaginary unit. Note: avoid (CPX) cast + std::vector F(N); // alloc output mode coeffs // Make the input data.................................... - srand(42); // seed - std::vector x(M); // NU pts locs - std::vector c(M); // strengths - for (BIGINT j=0; j x(M); // NU pts locs + std::vector c(M); // strengths + for (BIGINT j = 0; j < M; ++j) { + x[j] = M_PI * (2 * ((FLT)rand() / (FLT)RAND_MAX) - 1); // uniform random in + // [-pi,pi) + c[j] = 2 * ((FLT)rand() / (FLT)RAND_MAX) - 1 + + I * (2 * ((FLT)rand() / (FLT)RAND_MAX) - 1); } // Run it (NULL = default opts) ....................................... - int ier = FINUFFT1D1(M,&x[0],&c[0],isign,tol,N,&F[0],NULL); - if (ier!=0) { - printf("basicpassfail: finufft1d1 error (ier=%d)!",ier); + int ier = FINUFFT1D1(M, &x[0], &c[0], isign, tol, N, &F[0], NULL); + if (ier != 0) { + printf("basicpassfail: finufft1d1 error (ier=%d)!", ier); exit(ier); } // Check correct math for a single mode................... - BIGINT n = (BIGINT)(0.37*N); // choose some mode near the top (N/2) - CPX Ftest = CPX(0.0,0.0); // crude exact answer & error check... - for (BIGINT j=0; jFinfnrm) Finfnrm=aF; + BIGINT n = (BIGINT)(0.37 * N); // choose some mode near the top (N/2) + CPX Ftest = CPX(0.0, 0.0); // crude exact answer & error check... + for (BIGINT j = 0; j < M; ++j) Ftest += c[j] * exp((FLT)isign * I * (FLT)n * x[j]); + BIGINT nout = n + N / 2; // index in output array for freq mode n + FLT Finfnrm = 0.0; // compute inf norm of F... 
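// The spot check here is the type-1 NUFFT definition evaluated directly for a
// single mode: F_n = sum_{j=0..M-1} c_j * exp(isign*i*n*x_j), with mode indices
// running from -N/2 to N/2-1, so mode n sits at output index nout = n + N/2.
// A minimal standalone form of that O(M) reference sum, reusing the CPX/FLT/BIGINT
// aliases this test already gets from the finufft headers (the function name
// direct_mode is hypothetical):
static CPX direct_mode(BIGINT M, const FLT *x, const CPX *c, int isign, BIGINT n) {
  const CPX I(0.0, 1.0);
  CPX F(0.0, 0.0);
  for (BIGINT j = 0; j < M; ++j) F += c[j] * exp((FLT)isign * I * (FLT)n * x[j]);
  return F; // crude reference value for one output mode
}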
+ for (int m = 0; m < N; ++m) { + FLT aF = abs(F[m]); // note C++ abs complex type, not C fabs(f) + if (aF > Finfnrm) Finfnrm = aF; } - FLT relerr = abs(F[nout] - Ftest)/Finfnrm; - //printf("requested tol %.3g: rel err for one mode %.3g\n",tol,relerr); - return (std::isnan(relerr) || relerr > 10.0*tol); // true reports failure + FLT relerr = abs(F[nout] - Ftest) / Finfnrm; + // printf("requested tol %.3g: rel err for one mode %.3g\n",tol,relerr); + return (std::isnan(relerr) || relerr > 10.0 * tol); // true reports failure } diff --git a/test/cuda/cufinufft1d_test.cu b/test/cuda/cufinufft1d_test.cu index bb2d96758..05b62025e 100644 --- a/test/cuda/cufinufft1d_test.cu +++ b/test/cuda/cufinufft1d_test.cu @@ -16,190 +16,193 @@ using cufinufft::utils::infnorm; -template +template int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag) { - std::cout << std::scientific << std::setprecision(3); - int ier; - - thrust::host_vector x(M); - thrust::host_vector> c(M); - thrust::host_vector> fk(N1); - - thrust::device_vector d_x(M); - thrust::device_vector> d_c(M); - thrust::device_vector> d_fk(N1); - - std::default_random_engine eng(1); - std::uniform_real_distribution dist11(-1, 1); - auto randm11 = [&eng, &dist11]() { return dist11(eng); }; - - // Making data + std::cout << std::scientific << std::setprecision(3); + int ier; + + thrust::host_vector x(M); + thrust::host_vector> c(M); + thrust::host_vector> fk(N1); + + thrust::device_vector d_x(M); + thrust::device_vector> d_c(M); + thrust::device_vector> d_fk(N1); + + std::default_random_engine eng(1); + std::uniform_real_distribution dist11(-1, 1); + auto randm11 = [&eng, &dist11]() { + return dist11(eng); + }; + + // Making data + for (int i = 0; i < M; i++) { + x[i] = M_PI * randm11(); // x in [-pi,pi) + } + if (type == 1) { for (int i = 0; i < M; i++) { - x[i] = M_PI * randm11(); // x in [-pi,pi) + c[i].real(randm11()); + c[i].imag(randm11()); } - if (type == 1) { - for (int i = 0; i < M; i++) { - c[i].real(randm11()); - c[i].imag(randm11()); - } - } else if (type == 2) { - for (int i = 0; i < N1; i++) { - fk[i].real(randm11()); - fk[i].imag(randm11()); - } - } else { - std::cerr << "Invalid type " << type << " supplied\n"; - return 1; + } else if (type == 2) { + for (int i = 0; i < N1; i++) { + fk[i].real(randm11()); + fk[i].imag(randm11()); } - - d_x = x; - if (type == 1) - d_c = c; - else if (type == 2) - d_fk = fk; - - cudaEvent_t start, stop; - float milliseconds = 0; - float totaltime = 0; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - // warm up CUFFT (is slow, takes around 0.2 sec... ) - cudaEventRecord(start); - { - int nf1 = 1; - cufftHandle fftplan; - cufftPlan1d(&fftplan, nf1, cufft_type(), 1); - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - printf("[time ] dummy warmup call to CUFFT\t %.3g s\n", milliseconds / 1000); - - // now to the test... - cufinufft_plan_t *dplan; - const int dim = 1; - - // Here we setup our own opts, for gpu_method. 
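// Every stage of these GPU tests is timed with the same CUDA-event idiom seen in
// the warm-up block above: record an event before and after the call, synchronize
// on the stop event, then read the elapsed milliseconds. A minimal reusable form,
// assuming <cuda_runtime.h> is available as in these tests (the helper name
// time_ms and the default-stream usage are illustrative):
template<typename Work> static float time_ms(Work &&work) {
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start);
  work();                     // enqueue the stage being timed
  cudaEventRecord(stop);
  cudaEventSynchronize(stop); // block until the GPU has passed 'stop'
  float ms = 0;
  cudaEventElapsedTime(&ms, start, stop);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return ms;
}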
- cufinufft_opts opts; - cufinufft_default_opts(&opts); - - opts.gpu_method = method; - opts.gpu_maxbatchsize = 1; - - int nmodes[3] = {N1, 1, 1}; - int ntransf = 1; - cudaEventRecord(start); - - ier = cufinufft_makeplan_impl(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts); - if (ier != 0) { - printf("err: cufinufft1d_plan\n"); - return ier; - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - printf("[time ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000); - - cudaEventRecord(start); - ier = cufinufft_setpts_impl(M, d_x.data().get(), NULL, NULL, 0, NULL, NULL, NULL, dplan); - - if (ier != 0) { - printf("err: cufinufft_setpts\n"); - return ier; - } - - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - printf("[time ] cufinufft setNUpts:\t\t %.3g s\n", milliseconds / 1000); - - cudaEventRecord(start); - ier = cufinufft_execute_impl((cuda_complex *)d_c.data().get(), (cuda_complex *)d_fk.data().get(), dplan); - - if (ier != 0) { - printf("err: cufinufft1d_exec\n"); - return ier; - } - - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - float exec_ms = milliseconds; - printf("[time ] cufinufft exec:\t\t %.3g s\n", milliseconds / 1000); - - cudaEventRecord(start); - ier = cufinufft_destroy_impl(dplan); - if (ier != 0) { - printf("err %d: cufinufft1d_destroy\n", ier); - return ier; - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - printf("[time ] cufinufft destroy:\t\t %.3g s\n", milliseconds / 1000); - - printf("[Method %d] %d U pts to %d NU pts in %.3g s: %.3g NU pts/s\n", opts.gpu_method, N1, M, - totaltime / 1000, M / totaltime * 1000); - printf("\t\t\t\t\t(exec-only thoughput: %.3g NU pts/s)\n", M / exec_ms * 1000); - - T rel_error = std::numeric_limits::max(); - if (type == 1) { - fk = d_fk; - int nt1 = 0.37 * N1; // choose some mode index to check - thrust::complex Ft = thrust::complex(0, 0), J = thrust::complex(0.0, iflag); - for (int j = 0; j < M; ++j) - Ft += c[j] * exp(J * (nt1 * x[j])); // crude direct - int it = N1 / 2 + nt1; // index in complex F as 1d array - - rel_error = abs(Ft - fk[it]) / infnorm(N1, (std::complex *)fk.data()); - printf("[gpu ] one mode: rel err in F[%d] is %.3g\n", nt1, rel_error); - } else if (type == 2) { - c = d_c; - - int jt = M / 2; // check arbitrary choice of one targ pt - thrust::complex J = thrust::complex(0, iflag); - thrust::complex ct = thrust::complex(0, 0); - int m = 0; - for (int m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1) - ct += fk[m++] * exp(J * (m1 * x[jt])); // crude direct - rel_error = abs(c[jt] - ct) / infnorm(M, (std::complex *)c.data()); - printf("[gpu ] one targ: rel err in c[%d] is %.3g\n", jt, rel_error); - } - - return std::isnan(rel_error) || rel_error > checktol; + } else { + std::cerr << "Invalid type " << type << " supplied\n"; + return 1; + } + + d_x = x; + if (type == 1) + d_c = c; + else if (type == 2) + d_fk = fk; + + cudaEvent_t start, stop; + float milliseconds = 0; + float totaltime = 0; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + // warm up CUFFT (is slow, takes around 0.2 sec... 
) + cudaEventRecord(start); + { + int nf1 = 1; + cufftHandle fftplan; + cufftPlan1d(&fftplan, nf1, cufft_type(), 1); + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + printf("[time ] dummy warmup call to CUFFT\t %.3g s\n", milliseconds / 1000); + + // now to the test... + cufinufft_plan_t *dplan; + const int dim = 1; + + // Here we setup our own opts, for gpu_method. + cufinufft_opts opts; + cufinufft_default_opts(&opts); + + opts.gpu_method = method; + opts.gpu_maxbatchsize = 1; + + int nmodes[3] = {N1, 1, 1}; + int ntransf = 1; + cudaEventRecord(start); + + ier = cufinufft_makeplan_impl(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts); + if (ier != 0) { + printf("err: cufinufft1d_plan\n"); + return ier; + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + printf("[time ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000); + + cudaEventRecord(start); + ier = cufinufft_setpts_impl(M, d_x.data().get(), NULL, NULL, 0, NULL, NULL, NULL, + dplan); + + if (ier != 0) { + printf("err: cufinufft_setpts\n"); + return ier; + } + + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + printf("[time ] cufinufft setNUpts:\t\t %.3g s\n", milliseconds / 1000); + + cudaEventRecord(start); + ier = cufinufft_execute_impl((cuda_complex *)d_c.data().get(), + (cuda_complex *)d_fk.data().get(), dplan); + + if (ier != 0) { + printf("err: cufinufft1d_exec\n"); + return ier; + } + + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + float exec_ms = milliseconds; + printf("[time ] cufinufft exec:\t\t %.3g s\n", milliseconds / 1000); + + cudaEventRecord(start); + ier = cufinufft_destroy_impl(dplan); + if (ier != 0) { + printf("err %d: cufinufft1d_destroy\n", ier); + return ier; + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + printf("[time ] cufinufft destroy:\t\t %.3g s\n", milliseconds / 1000); + + printf("[Method %d] %d U pts to %d NU pts in %.3g s: %.3g NU pts/s\n", + opts.gpu_method, N1, M, totaltime / 1000, M / totaltime * 1000); + printf("\t\t\t\t\t(exec-only thoughput: %.3g NU pts/s)\n", M / exec_ms * 1000); + + T rel_error = std::numeric_limits::max(); + if (type == 1) { + fk = d_fk; + int nt1 = 0.37 * N1; // choose some mode index to check + thrust::complex Ft = thrust::complex(0, 0), J = thrust::complex(0.0, iflag); + for (int j = 0; j < M; ++j) Ft += c[j] * exp(J * (nt1 * x[j])); // crude direct + int it = N1 / 2 + nt1; // index in complex F as 1d array + + rel_error = abs(Ft - fk[it]) / infnorm(N1, (std::complex *)fk.data()); + printf("[gpu ] one mode: rel err in F[%d] is %.3g\n", nt1, rel_error); + } else if (type == 2) { + c = d_c; + + int jt = M / 2; // check arbitrary choice of one targ pt + thrust::complex J = thrust::complex(0, iflag); + thrust::complex ct = thrust::complex(0, 0); + int m = 0; + for (int m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1) + ct += fk[m++] * exp(J * (m1 * x[jt])); // crude direct + rel_error = abs(c[jt] - ct) / infnorm(M, (std::complex *)c.data()); + printf("[gpu ] one targ: rel err in c[%d] is %.3g\n", jt, rel_error); + } + + return std::isnan(rel_error) || rel_error > checktol; } int main(int argc, char *argv[]) { - if (argc != 8) { - fprintf(stderr, "Usage: 
cufinufft1d_test method type N1 M tol checktol prec\n" - "Arguments:\n" - " method: One of\n" - " 1: nupts driven\n" - " type: Type of transform (1, 2)\n" - " N1: Number of fourier modes\n" - " M: The number of non-uniform points\n" - " tol: NUFFT tolerance\n" - " checktol: relative error to pass test\n" - " precision: f or d\n"); - return 1; - } - const int method = atoi(argv[1]); - const int type = atoi(argv[2]); - const int N1 = atof(argv[3]); - const int M = atof(argv[4]); - const double tol = atof(argv[5]); - const double checktol = atof(argv[6]); - const int iflag = 1; - const char prec = argv[7][0]; - if (prec == 'f') - return run_test(method, type, N1, M, tol, checktol, iflag); - else if (prec == 'd') - return run_test(method, type, N1, M, tol, checktol, iflag); - else - return -1; + if (argc != 8) { + fprintf(stderr, "Usage: cufinufft1d_test method type N1 M tol checktol prec\n" + "Arguments:\n" + " method: One of\n" + " 1: nupts driven\n" + " type: Type of transform (1, 2)\n" + " N1: Number of fourier modes\n" + " M: The number of non-uniform points\n" + " tol: NUFFT tolerance\n" + " checktol: relative error to pass test\n" + " precision: f or d\n"); + return 1; + } + const int method = atoi(argv[1]); + const int type = atoi(argv[2]); + const int N1 = atof(argv[3]); + const int M = atof(argv[4]); + const double tol = atof(argv[5]); + const double checktol = atof(argv[6]); + const int iflag = 1; + const char prec = argv[7][0]; + if (prec == 'f') + return run_test(method, type, N1, M, tol, checktol, iflag); + else if (prec == 'd') + return run_test(method, type, N1, M, tol, checktol, iflag); + else + return -1; } diff --git a/test/cuda/cufinufft2d1nupts_test.cu b/test/cuda/cufinufft2d1nupts_test.cu index 409c42625..6817712df 100644 --- a/test/cuda/cufinufft2d1nupts_test.cu +++ b/test/cuda/cufinufft2d1nupts_test.cu @@ -18,207 +18,213 @@ using cufinufft::utils::infnorm; -template -int run_test(int method) { - int N1 = 100; - int N2 = 100; - int N = N1 * N2; - int M1 = N1 * N2; - int M2 = 2 * N1 * N2; - - T tol = 1e-5; - int iflag = 1; - - std::cout << std::scientific << std::setprecision(3); - int ier; - - thrust::host_vector x1(M1), y1(M1); - thrust::host_vector> c1(M1), fk1(N1 * N2); - thrust::device_vector d_x1(M1), d_y1(M1); - thrust::device_vector> d_c1(M1), d_fk1(N1 * N2); - - thrust::host_vector x2(M2), y2(M2); - thrust::host_vector> c2(M2), fk2(N1 * N2); - thrust::device_vector d_x2(M2), d_y2(M2); - thrust::device_vector> d_c2(M2), d_fk2(N1 * N2); - - std::default_random_engine eng(1); - std::uniform_real_distribution dist11(-1, 1); - auto randm11 = [&eng, &dist11]() { return dist11(eng); }; - - // Making data - for (int i = 0; i < M1; i++) { - x1[i] = M_PI * randm11(); // x in [-pi,pi) - y1[i] = M_PI * randm11(); - c1[i].real(randm11()); - c1[i].imag(randm11()); - } - - for (int i = 0; i < M2; i++) { - x2[i] = M_PI * randm11(); // x in [-pi,pi) - y2[i] = M_PI * randm11(); - c2[i].real(randm11()); - c2[i].imag(randm11()); - } - - d_x1 = x1; - d_y1 = y1; - d_c1 = c1; - d_x2 = x2; - d_y2 = y2; - d_c2 = c2; - - cudaEvent_t start, stop; - float milliseconds = 0; - float totaltime = 0; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - // warm up CUFFT (is slow, takes around 0.2 sec... 
) - cudaEventRecord(start); - { - int nf1 = 1; - cufftHandle fftplan; - cufftPlan1d(&fftplan, nf1, cufft_type(), 1); - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - printf("[time ] dummy warmup call to CUFFT\t %.3g s\n", milliseconds / 1000); - - // now to our tests... - cufinufft_plan_t *dplan; - int dim = 2; - int type = 1; - - // Here we setup our own opts, for gpu_method. - cufinufft_opts opts; - cufinufft_default_opts(&opts); - - opts.gpu_method = method; - opts.gpu_maxbatchsize = 1; - - int nmodes[3]; - int ntransf = 1; - - nmodes[0] = N1; - nmodes[1] = N2; - nmodes[2] = 1; - cudaEventRecord(start); - ier = cufinufft_makeplan_impl(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts); - if (ier != 0) { - printf("err: cufinufft2d_plan\n"); - return ier; - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - printf("[time ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000); - - cudaEventRecord(start); - ier = cufinufft_setpts_impl(M1, d_x1.data().get(), d_y1.data().get(), NULL, 0, NULL, NULL, NULL, dplan); - if (ier != 0) { - printf("err: cufinufft_setpts (set 1)\n"); - return ier; - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - printf("[time ] cufinufft setNUpts (set 1):\t %.3g s\n", milliseconds / 1000); - - cudaEventRecord(start); - ier = cufinufft_execute_impl((cuda_complex *)d_c1.data().get(), (cuda_complex *)d_fk1.data().get(), dplan); - - if (ier != 0) { - printf("err: cufinufft2d1_exec (set 1)\n"); - return ier; - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - float exec_ms = milliseconds; - printf("[time ] cufinufft exec (set 1):\t %.3g s\n", milliseconds / 1000); - - cudaEventRecord(start); - ier = cufinufft_setpts_impl(M2, d_x2.data().get(), d_y2.data().get(), NULL, 0, NULL, NULL, NULL, dplan); - if (ier != 0) { - printf("err: cufinufft_setpts (set 2)\n"); - return ier; - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - printf("[time ] cufinufft setNUpts (set 2):\t %.3g s\n", milliseconds / 1000); - - cudaEventRecord(start); - ier = cufinufft_execute_impl((cuda_complex *)d_c2.data().get(), (cuda_complex *)d_fk2.data().get(), dplan); - if (ier != 0) { - printf("err: cufinufft2d1_exec (set 2)\n"); - return ier; - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - exec_ms += milliseconds; - printf("[time ] cufinufft exec (set 2):\t %.3g s\n", milliseconds / 1000); - - cudaEventRecord(start); - ier = cufinufft_destroy_impl(dplan); - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - printf("[time ] cufinufft destroy:\t\t %.3g s\n", milliseconds / 1000); - - fk1 = d_fk1; - fk2 = d_fk2; - - printf("[Method %d] (%d+%d) NU pts to %d U pts in %.3g s: %.3g NU pts/s\n", opts.gpu_method, M1, M2, N1 * N2, - totaltime / 1000, (M1 + M2) / totaltime * 1000); - printf("\t\t\t\t\t(exec-only thoughput: %.3g NU pts/s)\n", (M1 + M2) / exec_ms * 1000); - - int nt1 = (int)(0.37 * N1), nt2 = (int)(0.26 * N2); // choose some mode index to check - thrust::complex Ft(0, 0), J(0, iflag); - for (int j = 0; j 
< M1; ++j) - Ft += c1[j] * exp(J * (nt1 * x1[j] + nt2 * y1[j])); // crude direct - int it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2); // index in complex F as 1d array - - printf("[gpu ] one mode: rel err in F[%d,%d] is %.3g (set 1)\n", (int)nt1, (int)nt2, - abs(Ft - fk1[it]) / infnorm(N, (std::complex *)fk1.data())); - Ft = thrust::complex(0, 0); - for (int j = 0; j < M2; ++j) - Ft += c2[j] * exp(J * (nt1 * x2[j] + nt2 * y2[j])); // crude direct - printf("[gpu ] one mode: rel err in F[%d,%d] is %.3g (set 2)\n", (int)nt1, (int)nt2, - abs(Ft - fk2[it]) / infnorm(N, (std::complex *)fk2.data())); - - return 0; +template int run_test(int method) { + int N1 = 100; + int N2 = 100; + int N = N1 * N2; + int M1 = N1 * N2; + int M2 = 2 * N1 * N2; + + T tol = 1e-5; + int iflag = 1; + + std::cout << std::scientific << std::setprecision(3); + int ier; + + thrust::host_vector x1(M1), y1(M1); + thrust::host_vector> c1(M1), fk1(N1 * N2); + thrust::device_vector d_x1(M1), d_y1(M1); + thrust::device_vector> d_c1(M1), d_fk1(N1 * N2); + + thrust::host_vector x2(M2), y2(M2); + thrust::host_vector> c2(M2), fk2(N1 * N2); + thrust::device_vector d_x2(M2), d_y2(M2); + thrust::device_vector> d_c2(M2), d_fk2(N1 * N2); + + std::default_random_engine eng(1); + std::uniform_real_distribution dist11(-1, 1); + auto randm11 = [&eng, &dist11]() { + return dist11(eng); + }; + + // Making data + for (int i = 0; i < M1; i++) { + x1[i] = M_PI * randm11(); // x in [-pi,pi) + y1[i] = M_PI * randm11(); + c1[i].real(randm11()); + c1[i].imag(randm11()); + } + + for (int i = 0; i < M2; i++) { + x2[i] = M_PI * randm11(); // x in [-pi,pi) + y2[i] = M_PI * randm11(); + c2[i].real(randm11()); + c2[i].imag(randm11()); + } + + d_x1 = x1; + d_y1 = y1; + d_c1 = c1; + d_x2 = x2; + d_y2 = y2; + d_c2 = c2; + + cudaEvent_t start, stop; + float milliseconds = 0; + float totaltime = 0; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + // warm up CUFFT (is slow, takes around 0.2 sec... ) + cudaEventRecord(start); + { + int nf1 = 1; + cufftHandle fftplan; + cufftPlan1d(&fftplan, nf1, cufft_type(), 1); + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + printf("[time ] dummy warmup call to CUFFT\t %.3g s\n", milliseconds / 1000); + + // now to our tests... + cufinufft_plan_t *dplan; + int dim = 2; + int type = 1; + + // Here we setup our own opts, for gpu_method. 
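// What this test exercises, in outline: one cufinufft plan is built once and then
// reused for two different sets of nonuniform points, i.e.
//   makeplan  (once, with ntransf=1 and the mode sizes),
//   setpts(M1, x1, y1, ...)  followed by  execute(c1, fk1, plan),
//   setpts(M2, x2, y2, ...)  followed by  execute(c2, fk2, plan),
//   destroy   (once at the end),
// so only the point-dependent setup is redone between the two executes
// (the names abbreviate the cufinufft_*_impl calls used in this file).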
+ cufinufft_opts opts; + cufinufft_default_opts(&opts); + + opts.gpu_method = method; + opts.gpu_maxbatchsize = 1; + + int nmodes[3]; + int ntransf = 1; + + nmodes[0] = N1; + nmodes[1] = N2; + nmodes[2] = 1; + cudaEventRecord(start); + ier = cufinufft_makeplan_impl(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts); + if (ier != 0) { + printf("err: cufinufft2d_plan\n"); + return ier; + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + printf("[time ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000); + + cudaEventRecord(start); + ier = cufinufft_setpts_impl(M1, d_x1.data().get(), d_y1.data().get(), NULL, 0, NULL, + NULL, NULL, dplan); + if (ier != 0) { + printf("err: cufinufft_setpts (set 1)\n"); + return ier; + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + printf("[time ] cufinufft setNUpts (set 1):\t %.3g s\n", milliseconds / 1000); + + cudaEventRecord(start); + ier = cufinufft_execute_impl((cuda_complex *)d_c1.data().get(), + (cuda_complex *)d_fk1.data().get(), dplan); + + if (ier != 0) { + printf("err: cufinufft2d1_exec (set 1)\n"); + return ier; + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + float exec_ms = milliseconds; + printf("[time ] cufinufft exec (set 1):\t %.3g s\n", milliseconds / 1000); + + cudaEventRecord(start); + ier = cufinufft_setpts_impl(M2, d_x2.data().get(), d_y2.data().get(), NULL, 0, NULL, + NULL, NULL, dplan); + if (ier != 0) { + printf("err: cufinufft_setpts (set 2)\n"); + return ier; + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + printf("[time ] cufinufft setNUpts (set 2):\t %.3g s\n", milliseconds / 1000); + + cudaEventRecord(start); + ier = cufinufft_execute_impl((cuda_complex *)d_c2.data().get(), + (cuda_complex *)d_fk2.data().get(), dplan); + if (ier != 0) { + printf("err: cufinufft2d1_exec (set 2)\n"); + return ier; + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + exec_ms += milliseconds; + printf("[time ] cufinufft exec (set 2):\t %.3g s\n", milliseconds / 1000); + + cudaEventRecord(start); + ier = cufinufft_destroy_impl(dplan); + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + printf("[time ] cufinufft destroy:\t\t %.3g s\n", milliseconds / 1000); + + fk1 = d_fk1; + fk2 = d_fk2; + + printf("[Method %d] (%d+%d) NU pts to %d U pts in %.3g s: %.3g NU pts/s\n", + opts.gpu_method, M1, M2, N1 * N2, totaltime / 1000, + (M1 + M2) / totaltime * 1000); + printf("\t\t\t\t\t(exec-only thoughput: %.3g NU pts/s)\n", (M1 + M2) / exec_ms * 1000); + + int nt1 = (int)(0.37 * N1), nt2 = (int)(0.26 * N2); // choose some mode index to check + thrust::complex Ft(0, 0), J(0, iflag); + for (int j = 0; j < M1; ++j) + Ft += c1[j] * exp(J * (nt1 * x1[j] + nt2 * y1[j])); // crude direct + int it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2); // index in complex F as 1d array + + printf("[gpu ] one mode: rel err in F[%d,%d] is %.3g (set 1)\n", (int)nt1, (int)nt2, + abs(Ft - fk1[it]) / infnorm(N, (std::complex *)fk1.data())); + Ft = thrust::complex(0, 0); + for (int j = 0; j < M2; ++j) + Ft += c2[j] * exp(J * (nt1 * x2[j] + nt2 * y2[j])); // 
crude direct + printf("[gpu ] one mode: rel err in F[%d,%d] is %.3g (set 2)\n", (int)nt1, (int)nt2, + abs(Ft - fk2[it]) / infnorm(N, (std::complex *)fk2.data())); + + return 0; } int main(int argc, char *argv[]) { - if (argc < 3) { - fprintf(stderr, "Usage: cufinufft2d1nupts_test method\n" - "Arguments:\n" - " method: One of\n" - " 1: nupts driven,\n" - " 2: sub-problem, or\n" - " precision: f or d\n"); - return 1; - } - int method; - sscanf(argv[1], "%d", &method); - char prec = argv[2][0]; - - if (prec == 'f') - return run_test(method); - else if (prec == 'd') - return run_test(method); - else - fprintf(stderr, "Invalid precision supplied: %s\n", argv[2]); - + if (argc < 3) { + fprintf(stderr, "Usage: cufinufft2d1nupts_test method\n" + "Arguments:\n" + " method: One of\n" + " 1: nupts driven,\n" + " 2: sub-problem, or\n" + " precision: f or d\n"); return 1; + } + int method; + sscanf(argv[1], "%d", &method); + char prec = argv[2][0]; + + if (prec == 'f') + return run_test(method); + else if (prec == 'd') + return run_test(method); + else + fprintf(stderr, "Invalid precision supplied: %s\n", argv[2]); + + return 1; } diff --git a/test/cuda/cufinufft2d_test.cu b/test/cuda/cufinufft2d_test.cu index 371b44b2f..4157f6230 100644 --- a/test/cuda/cufinufft2d_test.cu +++ b/test/cuda/cufinufft2d_test.cu @@ -17,189 +17,195 @@ using cufinufft::utils::infnorm; -template +template int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int iflag) { - std::cout << std::scientific << std::setprecision(3); - - thrust::host_vector x(M), y(M); - thrust::host_vector> c(M), fk(N1 * N2); - - thrust::device_vector d_x(M), d_y(M); - thrust::device_vector> d_c(M), d_fk(N1 * N2); - - std::default_random_engine eng(1); - std::uniform_real_distribution dist11(-1, 1); - auto randm11 = [&eng, &dist11]() { return dist11(eng); }; - - // Making data + std::cout << std::scientific << std::setprecision(3); + + thrust::host_vector x(M), y(M); + thrust::host_vector> c(M), fk(N1 * N2); + + thrust::device_vector d_x(M), d_y(M); + thrust::device_vector> d_c(M), d_fk(N1 * N2); + + std::default_random_engine eng(1); + std::uniform_real_distribution dist11(-1, 1); + auto randm11 = [&eng, &dist11]() { + return dist11(eng); + }; + + // Making data + for (int i = 0; i < M; i++) { + x[i] = M_PI * randm11(); // x in [-pi,pi) + y[i] = M_PI * randm11(); + } + if (type == 1) { for (int i = 0; i < M; i++) { - x[i] = M_PI * randm11(); // x in [-pi,pi) - y[i] = M_PI * randm11(); - } - if (type == 1) { - for (int i = 0; i < M; i++) { - c[i].real(randm11()); - c[i].imag(randm11()); - } - } else if (type == 2) { - for (int i = 0; i < N1 * N2; i++) { - fk[i].real(randm11()); - fk[i].imag(randm11()); - } - } else { - std::cerr << "Invalid type " << type << " supplied\n"; - return 1; - } - - d_x = x; - d_y = y; - if (type == 1) - d_c = c; - else if (type == 2) - d_fk = fk; - - cudaEvent_t start, stop; - float milliseconds = 0; - float totaltime = 0; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - // warm up CUFFT (is slow, takes around 0.2 sec... ) - cudaEventRecord(start); - { - int nf1 = 1; - cufftHandle fftplan; - cufftPlan1d(&fftplan, nf1, cufft_type(), 1); - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - printf("[time ] dummy warmup call to CUFFT\t %.3g s\n", milliseconds / 1000); - - // now to our tests... - cufinufft_plan_t *dplan; - const int dim = 2; - - // Here we setup our own opts, for gpu_method. 
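// Note on the mode ordering assumed by the accuracy checks later in this file:
// the linear index it = N1/2 + nt1 + N1*(N2/2 + nt2) means F is stored with the
// m1 (N1) direction fastest and modes running from -N/2 to N/2-1 in each
// dimension. The type-2 spot check therefore accumulates
//   c(x,y) = sum_{m2} sum_{m1} F[m1,m2] * exp(i*iflag*(m1*x + m2*y))
// with m1 as the inner loop, matching the flat m++ traversal of fk.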
- cufinufft_opts opts; - cufinufft_default_opts(&opts); - - opts.gpu_method = method; - opts.gpu_maxbatchsize = 1; - - int nmodes[3] = {N1, N2, 1}; - int ntransf = 1; - cudaEventRecord(start); - int ier = cufinufft_makeplan_impl(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts); - if (ier != 0) { - printf("err: cufinufft2d_plan\n"); - return ier; - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - printf("[time ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000); - - cudaEventRecord(start); - ier = cufinufft_setpts_impl(M, d_x.data().get(), d_y.data().get(), nullptr, 0, nullptr, nullptr, nullptr, dplan); - if (ier != 0) { - printf("err: cufinufft_setpts\n"); - return ier; + c[i].real(randm11()); + c[i].imag(randm11()); } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - printf("[time ] cufinufft setNUpts:\t\t %.3g s\n", milliseconds / 1000); - - cudaEventRecord(start); - ier = cufinufft_execute_impl((cuda_complex *)d_c.data().get(), (cuda_complex *)d_fk.data().get(), dplan); - if (ier != 0) { - printf("err: cufinufft2d1_exec\n"); - return ier; + } else if (type == 2) { + for (int i = 0; i < N1 * N2; i++) { + fk[i].real(randm11()); + fk[i].imag(randm11()); } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - float exec_ms = milliseconds; - printf("[time ] cufinufft exec:\t\t %.3g s\n", milliseconds / 1000); - - cudaEventRecord(start); - ier = cufinufft_destroy_impl(dplan); - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - printf("[time ] cufinufft destroy:\t\t %.3g s\n", milliseconds / 1000); - - if (type == 1) - fk = d_fk; - else if (type == 2) - c = d_c; - - printf("[Method %d] %d NU pts to %d U pts in %.3g s: %.3g NU pts/s\n", opts.gpu_method, M, N1 * N2, - totaltime / 1000, M / totaltime * 1000); - printf("\t\t\t\t\t(exec-only thoughput: %.3g NU pts/s)\n", M / exec_ms * 1000); - - T rel_error = std::numeric_limits::max(); - if (type == 1) { - const int nt1 = 0.37 * N1; - const int nt2 = 0.26 * N2; // choose some mode index to check - thrust::complex Ft = thrust::complex(0, 0), J = thrust::complex(0.0, iflag); - for (int j = 0; j < M; ++j) - Ft += c[j] * exp(J * (nt1 * x[j] + nt2 * y[j])); // crude direct - const int it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2); // index in complex F as 1d array - - rel_error = abs(Ft - fk[it]) / infnorm(N1, (std::complex *)fk.data()); - printf("[gpu ] one mode: rel err in F[%d,%d] is %.3g\n", nt1, nt2, rel_error); - } else if (type == 2) { - int jt = M / 2; // check arbitrary choice of one targ pt - thrust::complex J = thrust::complex(0, iflag); - thrust::complex ct = thrust::complex(0, 0); - - int m = 0; - for (int m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2) // loop in correct order over F - for (int m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1) - ct += fk[m++] * exp(J * (m1 * x[jt] + m2 * y[jt])); // crude direct - - rel_error = abs(c[jt] - ct) / infnorm(M, (std::complex *)c.data()); - printf("[gpu ] one targ: rel err in c[%d] is %.3g\n", jt, rel_error); - } - - return std::isnan(rel_error) || rel_error > checktol; + } else { + std::cerr << "Invalid type " << type << " supplied\n"; + return 1; + } + + d_x = x; + d_y = y; + if (type == 1) + d_c = c; + else if (type == 2) + d_fk = fk; + + cudaEvent_t start, 
stop; + float milliseconds = 0; + float totaltime = 0; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + // warm up CUFFT (is slow, takes around 0.2 sec... ) + cudaEventRecord(start); + { + int nf1 = 1; + cufftHandle fftplan; + cufftPlan1d(&fftplan, nf1, cufft_type(), 1); + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + printf("[time ] dummy warmup call to CUFFT\t %.3g s\n", milliseconds / 1000); + + // now to our tests... + cufinufft_plan_t *dplan; + const int dim = 2; + + // Here we setup our own opts, for gpu_method. + cufinufft_opts opts; + cufinufft_default_opts(&opts); + + opts.gpu_method = method; + opts.gpu_maxbatchsize = 1; + + int nmodes[3] = {N1, N2, 1}; + int ntransf = 1; + cudaEventRecord(start); + int ier = + cufinufft_makeplan_impl(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts); + if (ier != 0) { + printf("err: cufinufft2d_plan\n"); + return ier; + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + printf("[time ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000); + + cudaEventRecord(start); + ier = cufinufft_setpts_impl(M, d_x.data().get(), d_y.data().get(), nullptr, 0, + nullptr, nullptr, nullptr, dplan); + if (ier != 0) { + printf("err: cufinufft_setpts\n"); + return ier; + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + printf("[time ] cufinufft setNUpts:\t\t %.3g s\n", milliseconds / 1000); + + cudaEventRecord(start); + ier = cufinufft_execute_impl((cuda_complex *)d_c.data().get(), + (cuda_complex *)d_fk.data().get(), dplan); + if (ier != 0) { + printf("err: cufinufft2d1_exec\n"); + return ier; + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + float exec_ms = milliseconds; + printf("[time ] cufinufft exec:\t\t %.3g s\n", milliseconds / 1000); + + cudaEventRecord(start); + ier = cufinufft_destroy_impl(dplan); + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + printf("[time ] cufinufft destroy:\t\t %.3g s\n", milliseconds / 1000); + + if (type == 1) + fk = d_fk; + else if (type == 2) + c = d_c; + + printf("[Method %d] %d NU pts to %d U pts in %.3g s: %.3g NU pts/s\n", + opts.gpu_method, M, N1 * N2, totaltime / 1000, M / totaltime * 1000); + printf("\t\t\t\t\t(exec-only thoughput: %.3g NU pts/s)\n", M / exec_ms * 1000); + + T rel_error = std::numeric_limits::max(); + if (type == 1) { + const int nt1 = 0.37 * N1; + const int nt2 = 0.26 * N2; // choose some mode index to check + thrust::complex Ft = thrust::complex(0, 0), J = thrust::complex(0.0, iflag); + for (int j = 0; j < M; ++j) + Ft += c[j] * exp(J * (nt1 * x[j] + nt2 * y[j])); // crude direct + const int it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2); // index in complex F as 1d + // array + + rel_error = abs(Ft - fk[it]) / infnorm(N1, (std::complex *)fk.data()); + printf("[gpu ] one mode: rel err in F[%d,%d] is %.3g\n", nt1, nt2, rel_error); + } else if (type == 2) { + int jt = M / 2; // check arbitrary choice of one targ pt + thrust::complex J = thrust::complex(0, iflag); + thrust::complex ct = thrust::complex(0, 0); + + int m = 0; + for (int m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2) // loop in correct order over F + for (int m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1) + ct += fk[m++] * exp(J 
* (m1 * x[jt] + m2 * y[jt])); // crude direct + + rel_error = abs(c[jt] - ct) / infnorm(M, (std::complex *)c.data()); + printf("[gpu ] one targ: rel err in c[%d] is %.3g\n", jt, rel_error); + } + + return std::isnan(rel_error) || rel_error > checktol; } int main(int argc, char *argv[]) { - if (argc != 9) { - fprintf(stderr, "Usage: cufinufft2d1_test method N1 N2 M tol checktol\n" - "Arguments:\n" - " method: One of\n" - " 1: nupts driven,\n" - " 2: sub-problem, or\n" - " type: Type of transform (1, 2)" - " N1, N2: The size of the 2D array\n" - " M: The number of non-uniform points\n" - " tol: NUFFT tolerance\n" - " checktol: relative error to pass test\n" - " prec: 'f' or 'd' (float/double)\n"); - return 1; - } - const int method = atoi(argv[1]); - const int type = atoi(argv[2]); - const int N1 = atof(argv[3]); - const int N2 = atof(argv[4]); - const int M = atof(argv[5]); - const double tol = atof(argv[6]); - const double checktol = atof(argv[7]); - const char prec = argv[8][0]; - const int iflag = 1; - - if (prec == 'f') - return run_test(method, type, N1, N2, M, tol, checktol, iflag); - else if (prec == 'd') - return run_test(method, type, N1, N2, M, tol, checktol, iflag); - else - return -1; + if (argc != 9) { + fprintf(stderr, "Usage: cufinufft2d1_test method N1 N2 M tol checktol\n" + "Arguments:\n" + " method: One of\n" + " 1: nupts driven,\n" + " 2: sub-problem, or\n" + " type: Type of transform (1, 2)" + " N1, N2: The size of the 2D array\n" + " M: The number of non-uniform points\n" + " tol: NUFFT tolerance\n" + " checktol: relative error to pass test\n" + " prec: 'f' or 'd' (float/double)\n"); + return 1; + } + const int method = atoi(argv[1]); + const int type = atoi(argv[2]); + const int N1 = atof(argv[3]); + const int N2 = atof(argv[4]); + const int M = atof(argv[5]); + const double tol = atof(argv[6]); + const double checktol = atof(argv[7]); + const char prec = argv[8][0]; + const int iflag = 1; + + if (prec == 'f') + return run_test(method, type, N1, N2, M, tol, checktol, iflag); + else if (prec == 'd') + return run_test(method, type, N1, N2, M, tol, checktol, iflag); + else + return -1; } diff --git a/test/cuda/cufinufft2dmany_test.cu b/test/cuda/cufinufft2dmany_test.cu index 96f3cecf3..b4f3529e1 100644 --- a/test/cuda/cufinufft2dmany_test.cu +++ b/test/cuda/cufinufft2dmany_test.cu @@ -17,195 +17,209 @@ using cufinufft::utils::infnorm; -template -int run_test(int method, int type, int N1, int N2, int ntransf, int maxbatchsize, int M, T tol, T checktol, int iflag) { - std::cout << std::scientific << std::setprecision(3); - - int ier; - const int N = N1 * N2; - printf("#modes = %d, #inputs = %d, #NUpts = %d\n", N, ntransf, M); - - thrust::host_vector x(M), y(M); - thrust::host_vector> c(M * ntransf), fk(ntransf * N1 * N2); - - thrust::device_vector d_x(M), d_y(M); - thrust::device_vector> d_c(M * ntransf), d_fk(ntransf * N1 * N2); - - std::default_random_engine eng(1); - std::uniform_real_distribution dist11(-1, 1); - auto randm11 = [&eng, &dist11]() { return dist11(eng); }; - - // Making data - for (int i = 0; i < M; i++) { - x[i] = M_PI * randm11(); // x in [-pi,pi) - y[i] = M_PI * randm11(); - } - if (type == 1) { - for (int i = 0; i < ntransf * M; i++) { - c[i].real(randm11()); - c[i].imag(randm11()); - } - } else if (type == 2) { - for (int i = 0; i < ntransf * N1 * N2; i++) { - fk[i].real(randm11()); - fk[i].imag(randm11()); - } - } else { - std::cerr << "Invalid type " << type << " supplied\n"; - return 1; +template +int run_test(int method, int type, int N1, int N2, 
int ntransf, int maxbatchsize, int M, + T tol, T checktol, int iflag) { + std::cout << std::scientific << std::setprecision(3); + + int ier; + const int N = N1 * N2; + printf("#modes = %d, #inputs = %d, #NUpts = %d\n", N, ntransf, M); + + thrust::host_vector x(M), y(M); + thrust::host_vector> c(M * ntransf), fk(ntransf * N1 * N2); + + thrust::device_vector d_x(M), d_y(M); + thrust::device_vector> d_c(M * ntransf), d_fk(ntransf * N1 * N2); + + std::default_random_engine eng(1); + std::uniform_real_distribution dist11(-1, 1); + auto randm11 = [&eng, &dist11]() { + return dist11(eng); + }; + + // Making data + for (int i = 0; i < M; i++) { + x[i] = M_PI * randm11(); // x in [-pi,pi) + y[i] = M_PI * randm11(); + } + if (type == 1) { + for (int i = 0; i < ntransf * M; i++) { + c[i].real(randm11()); + c[i].imag(randm11()); } - - d_x = x; - d_y = y; - if (type == 1) - d_c = c; - else if (type == 2) - d_fk = fk; - - cudaEvent_t start, stop; - float milliseconds = 0; - double totaltime = 0; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - // warm up CUFFT (is slow, takes around 0.2 sec... ) - cudaEventRecord(start); - { - int nf1 = 1; - cufftHandle fftplan; - cufftPlan1d(&fftplan, nf1, cufft_type(), 1); + } else if (type == 2) { + for (int i = 0; i < ntransf * N1 * N2; i++) { + fk[i].real(randm11()); + fk[i].imag(randm11()); } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - printf("[time ] dummy warmup call to CUFFT\t %.3g s\n", milliseconds / 1000); - - // now to the test... - cufinufft_plan_t *dplan; - int dim = 2; - - // Here we setup our own opts, for gpu_method. - cufinufft_opts opts; - cufinufft_default_opts(&opts); - - opts.gpu_method = method; - opts.gpu_maxbatchsize = maxbatchsize; - - int nmodes[3] = {N1, N2, 1}; - cudaEventRecord(start); - ier = cufinufft_makeplan_impl(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts); - if (ier != 0) { - printf("err: cufinufft2d_plan\n"); - return ier; - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - printf("[time ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000); - - cudaEventRecord(start); - ier = cufinufft_setpts_impl(M, d_x.data().get(), d_y.data().get(), NULL, 0, NULL, NULL, NULL, dplan); - if (ier != 0) { - printf("err: cufinufft_setpts\n"); - return ier; - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - printf("[time ] cufinufft setNUpts:\t\t %.3g s\n", milliseconds / 1000); - - cudaEventRecord(start); - ier = cufinufft_execute_impl((cuda_complex *)d_c.data().get(), (cuda_complex *)d_fk.data().get(), dplan); - if (ier != 0) { - printf("err: cufinufft2d_exec\n"); - return ier; - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - float exec_ms = milliseconds; - totaltime += milliseconds; - printf("[time ] cufinufft exec:\t\t %.3g s\n", milliseconds / 1000); - - cudaEventRecord(start); - ier = cufinufft_destroy_impl(dplan); - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - printf("[time ] cufinufft destroy:\t\t %.3g s\n", milliseconds / 1000); - - if (type == 1) - fk = d_fk; - else if (type == 2) - c = d_c; - - T rel_error = std::numeric_limits::max(); - if (type == 1) { - int i = ntransf - 1; // // choose some data to check - int nt1 = 
(int)(0.37 * N1), nt2 = (int)(0.26 * N2); // choose some mode index to check - thrust::complex Ft = thrust::complex(0, 0), J = thrust::complex(0.0, iflag); - for (int j = 0; j < M; ++j) - Ft += c[j + i * M] * exp(J * (nt1 * x[j] + nt2 * y[j])); // crude direct - int it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2); // index in complex F as 1d array - rel_error = abs(Ft - fk[it + i * N]) / infnorm(N1, (std::complex *)fk.data() + i * N); - printf("[gpu ] %dth data one mode: rel err in F[%d,%d] is %.3g\n", i, nt1, nt2, rel_error); - } else if (type == 2) { - const int t = ntransf - 1; - thrust::complex *fkstart = fk.data() + t * N1 * N2; - const thrust::complex *cstart = c.data() + t * M; - const int jt = M / 2; // check arbitrary choice of one targ pt - const thrust::complex J(0, iflag); - thrust::complex ct(0, 0); - int m = 0; - for (int m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2) // loop in correct order over F - for (int m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1) - ct += fkstart[m++] * exp(J * (m1 * x[jt] + m2 * y[jt])); // crude direct - - rel_error = abs(cstart[jt] - ct) / infnorm(M, (std::complex *)c.data()); - printf("[gpu ] %dth data one targ: rel err in c[%d] is %.3g\n", t, jt, rel_error); - } - - printf("[totaltime] %.3g us, speed %.3g NUpts/s\n", totaltime * 1000, M * ntransf / totaltime * 1000); - printf("\t\t\t\t\t(exec-only thoughput: %.3g NU pts/s)\n", M * ntransf / exec_ms * 1000); - return std::isnan(rel_error) || rel_error > checktol; + } else { + std::cerr << "Invalid type " << type << " supplied\n"; + return 1; + } + + d_x = x; + d_y = y; + if (type == 1) + d_c = c; + else if (type == 2) + d_fk = fk; + + cudaEvent_t start, stop; + float milliseconds = 0; + double totaltime = 0; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + // warm up CUFFT (is slow, takes around 0.2 sec... ) + cudaEventRecord(start); + { + int nf1 = 1; + cufftHandle fftplan; + cufftPlan1d(&fftplan, nf1, cufft_type(), 1); + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + printf("[time ] dummy warmup call to CUFFT\t %.3g s\n", milliseconds / 1000); + + // now to the test... + cufinufft_plan_t *dplan; + int dim = 2; + + // Here we setup our own opts, for gpu_method. 
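// Data layout in this vectorized ("many") test: all ntransf transforms share one
// set of NU points, while strengths and mode coefficients are stacked contiguously
// per transform,
//   c : length M*ntransf,     transform i starts at c  + i*M
//   fk: length N1*N2*ntransf, transform i starts at fk + i*N1*N2
// which is why the checks below index c[j + i*M] and fk[it + i*N]. The
// gpu_maxbatchsize option set just below caps how many of these transforms are
// processed per execute call (0 asks for the library default, per the usage text
// at the bottom of this file).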
+ cufinufft_opts opts; + cufinufft_default_opts(&opts); + + opts.gpu_method = method; + opts.gpu_maxbatchsize = maxbatchsize; + + int nmodes[3] = {N1, N2, 1}; + cudaEventRecord(start); + ier = cufinufft_makeplan_impl(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts); + if (ier != 0) { + printf("err: cufinufft2d_plan\n"); + return ier; + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + printf("[time ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000); + + cudaEventRecord(start); + ier = cufinufft_setpts_impl(M, d_x.data().get(), d_y.data().get(), NULL, 0, NULL, + NULL, NULL, dplan); + if (ier != 0) { + printf("err: cufinufft_setpts\n"); + return ier; + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + printf("[time ] cufinufft setNUpts:\t\t %.3g s\n", milliseconds / 1000); + + cudaEventRecord(start); + ier = cufinufft_execute_impl((cuda_complex *)d_c.data().get(), + (cuda_complex *)d_fk.data().get(), dplan); + if (ier != 0) { + printf("err: cufinufft2d_exec\n"); + return ier; + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + float exec_ms = milliseconds; + totaltime += milliseconds; + printf("[time ] cufinufft exec:\t\t %.3g s\n", milliseconds / 1000); + + cudaEventRecord(start); + ier = cufinufft_destroy_impl(dplan); + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + printf("[time ] cufinufft destroy:\t\t %.3g s\n", milliseconds / 1000); + + if (type == 1) + fk = d_fk; + else if (type == 2) + c = d_c; + + T rel_error = std::numeric_limits::max(); + if (type == 1) { + int i = ntransf - 1; // // choose some data to check + int nt1 = (int)(0.37 * N1), nt2 = (int)(0.26 * N2); // choose some mode index to + // check + thrust::complex Ft = thrust::complex(0, 0), J = thrust::complex(0.0, iflag); + for (int j = 0; j < M; ++j) + Ft += c[j + i * M] * exp(J * (nt1 * x[j] + nt2 * y[j])); // crude direct + int it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2); // index in complex F as 1d array + rel_error = + abs(Ft - fk[it + i * N]) / infnorm(N1, (std::complex *)fk.data() + i * N); + printf("[gpu ] %dth data one mode: rel err in F[%d,%d] is %.3g\n", i, nt1, nt2, + rel_error); + } else if (type == 2) { + const int t = ntransf - 1; + thrust::complex *fkstart = fk.data() + t * N1 * N2; + const thrust::complex *cstart = c.data() + t * M; + const int jt = M / 2; // check arbitrary choice of one targ pt + const thrust::complex J(0, iflag); + thrust::complex ct(0, 0); + int m = 0; + for (int m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2) // loop in correct order over F + for (int m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1) + ct += fkstart[m++] * exp(J * (m1 * x[jt] + m2 * y[jt])); // crude direct + + rel_error = abs(cstart[jt] - ct) / infnorm(M, (std::complex *)c.data()); + printf("[gpu ] %dth data one targ: rel err in c[%d] is %.3g\n", t, jt, rel_error); + } + + printf("[totaltime] %.3g us, speed %.3g NUpts/s\n", totaltime * 1000, + M * ntransf / totaltime * 1000); + printf("\t\t\t\t\t(exec-only thoughput: %.3g NU pts/s)\n", + M * ntransf / exec_ms * 1000); + return std::isnan(rel_error) || rel_error > checktol; } int main(int argc, char *argv[]) { - if (argc != 11) { - fprintf(stderr, "Usage: cufinufft2d1many_test method type N1 N2 ntransf maxbatchsize M tol checktol prec\n" - "Arguments:\n" 
- " method: One of\n" - " 1: nupts driven,\n" - " 2: sub-problem, or\n" - " type: Type of transform (1, 2)\n" - " N1, N2: The size of the 2D array\n" - " ntransf: Number of inputs\n" - " maxbatchsize: Number of simultaneous transforms (or 0 for default)\n" - " M: The number of non-uniform points\n" - " tol: NUFFT tolerance\n" - " checktol: relative error to pass test\n" - " prec: 'f' or 'd' (float/double)\n"); - return 1; - } - const int method = atoi(argv[1]); - const int type = atoi(argv[2]); - const int N1 = atof(argv[3]); - const int N2 = atof(argv[4]); - const int ntransf = atof(argv[5]); - const int maxbatchsize = atoi(argv[6]); - const int M = atoi(argv[7]); - const double tol = atof(argv[8]); - const double checktol = atof(argv[9]); - const char prec = argv[10][0]; - const int iflag = 1; - - if (prec == 'f') - return run_test(method, type, N1, N2, ntransf, maxbatchsize, M, tol, checktol, iflag); - else if (prec == 'd') - return run_test(method, type, N1, N2, ntransf, maxbatchsize, M, tol, checktol, iflag); - else - return -1; + if (argc != 11) { + fprintf(stderr, + "Usage: cufinufft2d1many_test method type N1 N2 ntransf maxbatchsize M " + "tol checktol prec\n" + "Arguments:\n" + " method: One of\n" + " 1: nupts driven,\n" + " 2: sub-problem, or\n" + " type: Type of transform (1, 2)\n" + " N1, N2: The size of the 2D array\n" + " ntransf: Number of inputs\n" + " maxbatchsize: Number of simultaneous transforms (or 0 for default)\n" + " M: The number of non-uniform points\n" + " tol: NUFFT tolerance\n" + " checktol: relative error to pass test\n" + " prec: 'f' or 'd' (float/double)\n"); + return 1; + } + const int method = atoi(argv[1]); + const int type = atoi(argv[2]); + const int N1 = atof(argv[3]); + const int N2 = atof(argv[4]); + const int ntransf = atof(argv[5]); + const int maxbatchsize = atoi(argv[6]); + const int M = atoi(argv[7]); + const double tol = atof(argv[8]); + const double checktol = atof(argv[9]); + const char prec = argv[10][0]; + const int iflag = 1; + + if (prec == 'f') + return run_test(method, type, N1, N2, ntransf, maxbatchsize, M, tol, checktol, + iflag); + else if (prec == 'd') + return run_test(method, type, N1, N2, ntransf, maxbatchsize, M, tol, checktol, + iflag); + else + return -1; } diff --git a/test/cuda/cufinufft3d_test.cu b/test/cuda/cufinufft3d_test.cu index a882f6715..933dda36d 100644 --- a/test/cuda/cufinufft3d_test.cu +++ b/test/cuda/cufinufft3d_test.cu @@ -17,198 +17,210 @@ using cufinufft::utils::infnorm; -template -int run_test(int method, int type, int N1, int N2, int N3, int M, T tol, T checktol, int iflag) { - std::cout << std::scientific << std::setprecision(3); - int ier; - - thrust::host_vector x(M), y(M), z(M); - thrust::host_vector> c(M), fk(N1 * N2 * N3); - - thrust::device_vector d_x(M), d_y(M), d_z(M); - thrust::device_vector> d_c(M), d_fk(N1 * N2 * N3); - - std::default_random_engine eng(1); - std::uniform_real_distribution dist11(-1, 1); - auto randm11 = [&eng, &dist11]() { return dist11(eng); }; - - // Making data +template +int run_test(int method, int type, int N1, int N2, int N3, int M, T tol, T checktol, + int iflag) { + std::cout << std::scientific << std::setprecision(3); + int ier; + + thrust::host_vector x(M), y(M), z(M); + thrust::host_vector> c(M), fk(N1 * N2 * N3); + + thrust::device_vector d_x(M), d_y(M), d_z(M); + thrust::device_vector> d_c(M), d_fk(N1 * N2 * N3); + + std::default_random_engine eng(1); + std::uniform_real_distribution dist11(-1, 1); + auto randm11 = [&eng, &dist11]() { + return dist11(eng); + }; + + 
// Making data + for (int i = 0; i < M; i++) { + x[i] = M_PI * randm11(); // x in [-pi,pi) + y[i] = M_PI * randm11(); + z[i] = M_PI * randm11(); + } + if (type == 1) { for (int i = 0; i < M; i++) { - x[i] = M_PI * randm11(); // x in [-pi,pi) - y[i] = M_PI * randm11(); - z[i] = M_PI * randm11(); - } - if (type == 1) { - for (int i = 0; i < M; i++) { - c[i].real(randm11()); - c[i].imag(randm11()); - } - } else if (type == 2) { - for (int i = 0; i < N1 * N2 * N3; i++) { - fk[i].real(randm11()); - fk[i].imag(randm11()); - } - } else { - std::cerr << "Invalid type " << type << " supplied\n"; - return 1; - } - - d_x = x; - d_y = y; - d_z = z; - - if (type == 1) - d_c = c; - else if (type == 2) - d_fk = fk; - - cudaEvent_t start, stop; - float milliseconds = 0; - float totaltime = 0; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - // warm up CUFFT (is slow, takes around 0.2 sec... ) - cudaEventRecord(start); - { - int nf1 = 1; - cufftHandle fftplan; - cufftPlan1d(&fftplan, nf1, cufft_type(), 1); - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - printf("[time ] dummy warmup call to CUFFT\t %.3g s\n", milliseconds / 1000); - - // now to the test... - cufinufft_plan_t *dplan; - int dim = 3; - - // Here we setup our own opts, for gpu_method and gpu_kerevalmeth. - cufinufft_opts opts; - cufinufft_default_opts(&opts); - - opts.gpu_method = method; - opts.gpu_kerevalmeth = 1; - opts.gpu_maxbatchsize = 1; - - int nmodes[3] = {N1, N2, N3}; - int ntransf = 1; - - cudaEventRecord(start); - ier = cufinufft_makeplan_impl(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts); - if (ier != 0) { - printf("err: cufinufft_makeplan\n"); - return ier; - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - printf("[time ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000); - - cudaEventRecord(start); - ier = cufinufft_setpts_impl(M, d_x.data().get(), d_y.data().get(), d_z.data().get(), 0, nullptr, nullptr, - nullptr, dplan); - if (ier != 0) { - printf("err: cufinufft_setpts\n"); - return ier; + c[i].real(randm11()); + c[i].imag(randm11()); } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - printf("[time ] cufinufft setNUpts:\t\t %.3g s\n", milliseconds / 1000); - - cudaEventRecord(start); - ier = cufinufft_execute_impl((cuda_complex *)d_c.data().get(), (cuda_complex *)d_fk.data().get(), dplan); - if (ier != 0) { - printf("err: cufinufft_execute\n"); - return ier; + } else if (type == 2) { + for (int i = 0; i < N1 * N2 * N3; i++) { + fk[i].real(randm11()); + fk[i].imag(randm11()); } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - float exec_ms = milliseconds; - printf("[time ] cufinufft exec:\t\t %.3g s\n", milliseconds / 1000); - - cudaEventRecord(start); - ier = cufinufft_destroy_impl(dplan); - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - totaltime += milliseconds; - printf("[time ] cufinufft destroy:\t\t %.3g s\n", milliseconds / 1000); - - if (type == 1) - fk = d_fk; - else if (type == 2) - c = d_c; - - printf("[Method %d] %d NU pts to %d U pts in %.3g s:\t%.3g NU pts/s\n", opts.gpu_method, M, N1 * N2 * N3, - totaltime / 1000, M / totaltime * 1000); - printf("\t\t\t\t\t(exec-only thoughput: %.3g NU pts/s)\n", M / 
exec_ms * 1000); - - T rel_error = std::numeric_limits::max(); - if (type == 1) { - int nt1 = (int)(0.37 * N1), nt2 = (int)(0.26 * N2), nt3 = (int)(0.13 * N3); // choose some mode index to check - thrust::complex Ft = thrust::complex(0, 0), J = thrust::complex(0.0, iflag); - for (int j = 0; j < M; ++j) - Ft += c[j] * exp(J * (nt1 * x[j] + nt2 * y[j] + nt3 * z[j])); // crude direct - - int it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2) + N1 * N2 * (N3 / 2 + nt3); // index in complex F as 1d array - rel_error = abs(Ft - fk[it]) / infnorm(N1, (std::complex *)fk.data()); - printf("[gpu ] one mode: rel err in F[%d,%d,%d] is %.3g\n", nt1, nt2, nt3, rel_error); - } else if (type == 2) { - int jt = M / 2; // check arbitrary choice of one targ pt - thrust::complex J = thrust::complex(0, iflag); - thrust::complex ct = thrust::complex(0, 0); - - int m = 0; - for (int m3 = -(N3 / 2); m3 <= (N3 - 1) / 2; ++m3) // loop in correct order over F - for (int m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2) // loop in correct order over F - for (int m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1) - ct += fk[m++] * exp(J * (m1 * x[jt] + m2 * y[jt] + m3 * z[jt])); // crude direct - - rel_error = abs(c[jt] - ct) / infnorm(M, (std::complex *)c.data()); - printf("[gpu ] one targ: rel err in c[%ld] is %.3g\n", (int64_t)jt, rel_error); - } - - return std::isnan(rel_error) || rel_error > checktol; + } else { + std::cerr << "Invalid type " << type << " supplied\n"; + return 1; + } + + d_x = x; + d_y = y; + d_z = z; + + if (type == 1) + d_c = c; + else if (type == 2) + d_fk = fk; + + cudaEvent_t start, stop; + float milliseconds = 0; + float totaltime = 0; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + // warm up CUFFT (is slow, takes around 0.2 sec... ) + cudaEventRecord(start); + { + int nf1 = 1; + cufftHandle fftplan; + cufftPlan1d(&fftplan, nf1, cufft_type(), 1); + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + printf("[time ] dummy warmup call to CUFFT\t %.3g s\n", milliseconds / 1000); + + // now to the test... + cufinufft_plan_t *dplan; + int dim = 3; + + // Here we setup our own opts, for gpu_method and gpu_kerevalmeth. 
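// [Illustrative sketch, not part of the test] The three option fields set just below
// are the only knobs these GPU tests override; the numeric gpu_method codes match the
// names printed in the tests' usage strings (1: nupts driven, 2: sub-problem, and, for
// the 3D test only, 4: block gather). The Horner remark is an assumption, inferred from
// the FINUFFT_ERR_HORNER_WRONG_BETA check exercised later in this diff.
cufinufft_opts opts_sketch;
cufinufft_default_opts(&opts_sketch);
opts_sketch.gpu_method       = 2; // 1 = nupts driven, 2 = sub-problem, 4 = block gather (3D)
opts_sketch.gpu_kerevalmeth  = 1; // assumed: 1 = Horner-style kernel evaluation, 0 = direct
opts_sketch.gpu_maxbatchsize = 1; // one transform per batch, as in these tests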
+ cufinufft_opts opts; + cufinufft_default_opts(&opts); + + opts.gpu_method = method; + opts.gpu_kerevalmeth = 1; + opts.gpu_maxbatchsize = 1; + + int nmodes[3] = {N1, N2, N3}; + int ntransf = 1; + + cudaEventRecord(start); + ier = cufinufft_makeplan_impl(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts); + if (ier != 0) { + printf("err: cufinufft_makeplan\n"); + return ier; + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + printf("[time ] cufinufft plan:\t\t %.3g s\n", milliseconds / 1000); + + cudaEventRecord(start); + ier = cufinufft_setpts_impl(M, d_x.data().get(), d_y.data().get(), d_z.data().get(), + 0, nullptr, nullptr, nullptr, dplan); + if (ier != 0) { + printf("err: cufinufft_setpts\n"); + return ier; + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + printf("[time ] cufinufft setNUpts:\t\t %.3g s\n", milliseconds / 1000); + + cudaEventRecord(start); + ier = cufinufft_execute_impl((cuda_complex *)d_c.data().get(), + (cuda_complex *)d_fk.data().get(), dplan); + if (ier != 0) { + printf("err: cufinufft_execute\n"); + return ier; + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + float exec_ms = milliseconds; + printf("[time ] cufinufft exec:\t\t %.3g s\n", milliseconds / 1000); + + cudaEventRecord(start); + ier = cufinufft_destroy_impl(dplan); + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + totaltime += milliseconds; + printf("[time ] cufinufft destroy:\t\t %.3g s\n", milliseconds / 1000); + + if (type == 1) + fk = d_fk; + else if (type == 2) + c = d_c; + + printf("[Method %d] %d NU pts to %d U pts in %.3g s:\t%.3g NU pts/s\n", opts.gpu_method, + M, N1 * N2 * N3, totaltime / 1000, M / totaltime * 1000); + printf("\t\t\t\t\t(exec-only thoughput: %.3g NU pts/s)\n", M / exec_ms * 1000); + + T rel_error = std::numeric_limits::max(); + if (type == 1) { + int nt1 = (int)(0.37 * N1), nt2 = (int)(0.26 * N2), + nt3 = (int)(0.13 * N3); // choose some mode index to check + thrust::complex Ft = thrust::complex(0, 0), J = thrust::complex(0.0, iflag); + for (int j = 0; j < M; ++j) + Ft += c[j] * exp(J * (nt1 * x[j] + nt2 * y[j] + nt3 * z[j])); // crude direct + + int it = N1 / 2 + nt1 + N1 * (N2 / 2 + nt2) + N1 * N2 * (N3 / 2 + nt3); // index + // in + // complex + // F as 1d + // array + rel_error = abs(Ft - fk[it]) / infnorm(N1, (std::complex *)fk.data()); + printf("[gpu ] one mode: rel err in F[%d,%d,%d] is %.3g\n", nt1, nt2, nt3, + rel_error); + } else if (type == 2) { + int jt = M / 2; // check arbitrary choice of one targ pt + thrust::complex J = thrust::complex(0, iflag); + thrust::complex ct = thrust::complex(0, 0); + + int m = 0; + for (int m3 = -(N3 / 2); m3 <= (N3 - 1) / 2; ++m3) // loop in correct order over F + for (int m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2) // loop in correct order + // over F + for (int m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1) + ct += fk[m++] * exp(J * (m1 * x[jt] + m2 * y[jt] + m3 * z[jt])); // crude direct + + rel_error = abs(c[jt] - ct) / infnorm(M, (std::complex *)c.data()); + printf("[gpu ] one targ: rel err in c[%ld] is %.3g\n", (int64_t)jt, rel_error); + } + + return std::isnan(rel_error) || rel_error > checktol; } int main(int argc, char *argv[]) { - if (argc < 10) { - fprintf(stderr, "Usage: cufinufft3d1_test method 
type N1 N2 N3 M tol checktol prec\n" - "Arguments:\n" - " method: One of\n" - " 1: nupts driven,\n" - " 2: sub-problem, or\n" - " 4: block gather.\n" - " type: Type of transform (1, 2)" - " N1, N2, N3: The size of the 3D array\n" - " M: The number of non-uniform points\n" - " tol: NUFFT tolerance\n" - " checktol: relative error to pass test\n" - " prec: 'f' or 'd' (float/double)\n"); - return 1; - } - const int method = atoi(argv[1]); - const int type = atoi(argv[2]); - const int N1 = atof(argv[3]); - const int N2 = atof(argv[4]); - const int N3 = atof(argv[5]); - const int M = atof(argv[6]); - const double tol = atof(argv[7]); - const double checktol = atof(argv[8]); - const char prec = argv[9][0]; - const int iflag = 1; - - if (prec == 'f') - return run_test(method, type, N1, N2, N3, M, tol, checktol, iflag); - else if (prec == 'd') - return run_test(method, type, N1, N2, N3, M, tol, checktol, iflag); - else - return -1; + if (argc < 10) { + fprintf(stderr, + "Usage: cufinufft3d1_test method type N1 N2 N3 M tol checktol prec\n" + "Arguments:\n" + " method: One of\n" + " 1: nupts driven,\n" + " 2: sub-problem, or\n" + " 4: block gather.\n" + " type: Type of transform (1, 2)" + " N1, N2, N3: The size of the 3D array\n" + " M: The number of non-uniform points\n" + " tol: NUFFT tolerance\n" + " checktol: relative error to pass test\n" + " prec: 'f' or 'd' (float/double)\n"); + return 1; + } + const int method = atoi(argv[1]); + const int type = atoi(argv[2]); + const int N1 = atof(argv[3]); + const int N2 = atof(argv[4]); + const int N3 = atof(argv[5]); + const int M = atof(argv[6]); + const double tol = atof(argv[7]); + const double checktol = atof(argv[8]); + const char prec = argv[9][0]; + const int iflag = 1; + + if (prec == 'f') + return run_test(method, type, N1, N2, N3, M, tol, checktol, iflag); + else if (prec == 'd') + return run_test(method, type, N1, N2, N3, M, tol, checktol, iflag); + else + return -1; } diff --git a/test/cuda/fseries_kernel_test.cu b/test/cuda/fseries_kernel_test.cu index 7e1a5f728..7f18ee21c 100644 --- a/test/cuda/fseries_kernel_test.cu +++ b/test/cuda/fseries_kernel_test.cu @@ -13,155 +13,146 @@ using namespace cufinufft::common; using namespace cufinufft::spreadinterp; using namespace cufinufft::utils; -template -int run_test(int nf1, int dim, T eps, int gpu, int nf2, int nf3) { +template int run_test(int nf1, int dim, T eps, int gpu, int nf2, int nf3) { - finufft_spread_opts opts; - T *fwkerhalf1, *fwkerhalf2, *fwkerhalf3; - T *d_fwkerhalf1, *d_fwkerhalf2, *d_fwkerhalf3; - checkCudaErrors(cudaMalloc(&d_fwkerhalf1, sizeof(T) * (nf1 / 2 + 1))); - if (dim > 1) - checkCudaErrors(cudaMalloc(&d_fwkerhalf2, sizeof(T) * (nf2 / 2 + 1))); - if (dim > 2) - checkCudaErrors(cudaMalloc(&d_fwkerhalf3, sizeof(T) * (nf3 / 2 + 1))); + finufft_spread_opts opts; + T *fwkerhalf1, *fwkerhalf2, *fwkerhalf3; + T *d_fwkerhalf1, *d_fwkerhalf2, *d_fwkerhalf3; + checkCudaErrors(cudaMalloc(&d_fwkerhalf1, sizeof(T) * (nf1 / 2 + 1))); + if (dim > 1) checkCudaErrors(cudaMalloc(&d_fwkerhalf2, sizeof(T) * (nf2 / 2 + 1))); + if (dim > 2) checkCudaErrors(cudaMalloc(&d_fwkerhalf3, sizeof(T) * (nf3 / 2 + 1))); - int ier = setup_spreader(opts, (T)eps, (T)2.0, 0); + int ier = setup_spreader(opts, (T)eps, (T)2.0, 0); - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); - float milliseconds = 0; - float gputime = 0; - float cputime = 0; + float milliseconds = 0; + float gputime = 0; + float 
cputime = 0; - CNTime timer; - if (!gpu) { - timer.start(); - fwkerhalf1 = (T *)malloc(sizeof(T) * (nf1 / 2 + 1)); - if (dim > 1) - fwkerhalf2 = (T *)malloc(sizeof(T) * (nf2 / 2 + 1)); - if (dim > 2) - fwkerhalf3 = (T *)malloc(sizeof(T) * (nf3 / 2 + 1)); + CNTime timer; + if (!gpu) { + timer.start(); + fwkerhalf1 = (T *)malloc(sizeof(T) * (nf1 / 2 + 1)); + if (dim > 1) fwkerhalf2 = (T *)malloc(sizeof(T) * (nf2 / 2 + 1)); + if (dim > 2) fwkerhalf3 = (T *)malloc(sizeof(T) * (nf3 / 2 + 1)); - onedim_fseries_kernel(nf1, fwkerhalf1, opts); - if (dim > 1) - onedim_fseries_kernel(nf2, fwkerhalf2, opts); - if (dim > 2) - onedim_fseries_kernel(nf3, fwkerhalf3, opts); - cputime = timer.elapsedsec(); - cudaEventRecord(start); - { - checkCudaErrors(cudaMemcpy(d_fwkerhalf1, fwkerhalf1, sizeof(T) * (nf1 / 2 + 1), cudaMemcpyHostToDevice)); - if (dim > 1) - checkCudaErrors( - cudaMemcpy(d_fwkerhalf2, fwkerhalf2, sizeof(T) * (nf2 / 2 + 1), cudaMemcpyHostToDevice)); - if (dim > 2) - checkCudaErrors( - cudaMemcpy(d_fwkerhalf3, fwkerhalf3, sizeof(T) * (nf3 / 2 + 1), cudaMemcpyHostToDevice)); - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - gputime = milliseconds; - printf("[time ] dim=%d, nf1=%8d, ns=%2d, CPU: %6.2f ms\n", dim, nf1, opts.nspread, gputime + cputime * 1000); - free(fwkerhalf1); - if (dim > 1) - free(fwkerhalf2); - if (dim > 2) - free(fwkerhalf3); - } else { - timer.start(); - std::complex a[dim * MAX_NQUAD]; - T f[dim * MAX_NQUAD]; - onedim_fseries_kernel_precomp(nf1, f, a, opts); - if (dim > 1) - onedim_fseries_kernel_precomp(nf2, f + MAX_NQUAD, a + MAX_NQUAD, opts); - if (dim > 2) - onedim_fseries_kernel_precomp(nf3, f + 2 * MAX_NQUAD, a + 2 * MAX_NQUAD, opts); - cputime = timer.elapsedsec(); + onedim_fseries_kernel(nf1, fwkerhalf1, opts); + if (dim > 1) onedim_fseries_kernel(nf2, fwkerhalf2, opts); + if (dim > 2) onedim_fseries_kernel(nf3, fwkerhalf3, opts); + cputime = timer.elapsedsec(); + cudaEventRecord(start); + { + checkCudaErrors(cudaMemcpy(d_fwkerhalf1, fwkerhalf1, sizeof(T) * (nf1 / 2 + 1), + cudaMemcpyHostToDevice)); + if (dim > 1) + checkCudaErrors(cudaMemcpy(d_fwkerhalf2, fwkerhalf2, sizeof(T) * (nf2 / 2 + 1), + cudaMemcpyHostToDevice)); + if (dim > 2) + checkCudaErrors(cudaMemcpy(d_fwkerhalf3, fwkerhalf3, sizeof(T) * (nf3 / 2 + 1), + cudaMemcpyHostToDevice)); + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + gputime = milliseconds; + printf("[time ] dim=%d, nf1=%8d, ns=%2d, CPU: %6.2f ms\n", dim, nf1, opts.nspread, + gputime + cputime * 1000); + free(fwkerhalf1); + if (dim > 1) free(fwkerhalf2); + if (dim > 2) free(fwkerhalf3); + } else { + timer.start(); + std::complex a[dim * MAX_NQUAD]; + T f[dim * MAX_NQUAD]; + onedim_fseries_kernel_precomp(nf1, f, a, opts); + if (dim > 1) onedim_fseries_kernel_precomp(nf2, f + MAX_NQUAD, a + MAX_NQUAD, opts); + if (dim > 2) + onedim_fseries_kernel_precomp(nf3, f + 2 * MAX_NQUAD, a + 2 * MAX_NQUAD, opts); + cputime = timer.elapsedsec(); - cuDoubleComplex *d_a; - T *d_f; - cudaEventRecord(start); - { - checkCudaErrors(cudaMalloc(&d_a, dim * MAX_NQUAD * sizeof(cuDoubleComplex))); - checkCudaErrors(cudaMalloc(&d_f, dim * MAX_NQUAD * sizeof(T))); - checkCudaErrors(cudaMemcpy(d_a, a, dim * MAX_NQUAD * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_f, f, dim * MAX_NQUAD * sizeof(T), cudaMemcpyHostToDevice)); - ier = cufserieskernelcompute(dim, nf1, nf2, nf3, d_f, d_a, d_fwkerhalf1, 
d_fwkerhalf2, d_fwkerhalf3, - opts.nspread, cudaStreamDefault); - } - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - gputime = milliseconds; - printf("[time ] dim=%d, nf1=%8d, ns=%2d, GPU: %6.2f ms\n", dim, nf1, opts.nspread, gputime + cputime * 1000); - cudaFree(d_a); - cudaFree(d_f); + cuDoubleComplex *d_a; + T *d_f; + cudaEventRecord(start); + { + checkCudaErrors(cudaMalloc(&d_a, dim * MAX_NQUAD * sizeof(cuDoubleComplex))); + checkCudaErrors(cudaMalloc(&d_f, dim * MAX_NQUAD * sizeof(T))); + checkCudaErrors(cudaMemcpy(d_a, a, dim * MAX_NQUAD * sizeof(cuDoubleComplex), + cudaMemcpyHostToDevice)); + checkCudaErrors( + cudaMemcpy(d_f, f, dim * MAX_NQUAD * sizeof(T), cudaMemcpyHostToDevice)); + ier = + cufserieskernelcompute(dim, nf1, nf2, nf3, d_f, d_a, d_fwkerhalf1, d_fwkerhalf2, + d_fwkerhalf3, opts.nspread, cudaStreamDefault); } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&milliseconds, start, stop); + gputime = milliseconds; + printf("[time ] dim=%d, nf1=%8d, ns=%2d, GPU: %6.2f ms\n", dim, nf1, opts.nspread, + gputime + cputime * 1000); + cudaFree(d_a); + cudaFree(d_f); + } - fwkerhalf1 = (T *)malloc(sizeof(T) * (nf1 / 2 + 1)); - if (dim > 1) - fwkerhalf2 = (T *)malloc(sizeof(T) * (nf2 / 2 + 1)); - if (dim > 2) - fwkerhalf3 = (T *)malloc(sizeof(T) * (nf3 / 2 + 1)); + fwkerhalf1 = (T *)malloc(sizeof(T) * (nf1 / 2 + 1)); + if (dim > 1) fwkerhalf2 = (T *)malloc(sizeof(T) * (nf2 / 2 + 1)); + if (dim > 2) fwkerhalf3 = (T *)malloc(sizeof(T) * (nf3 / 2 + 1)); - checkCudaErrors(cudaMemcpy(fwkerhalf1, d_fwkerhalf1, sizeof(T) * (nf1 / 2 + 1), cudaMemcpyDeviceToHost)); - if (dim > 1) - checkCudaErrors(cudaMemcpy(fwkerhalf2, d_fwkerhalf2, sizeof(T) * (nf2 / 2 + 1), cudaMemcpyDeviceToHost)); - if (dim > 2) - checkCudaErrors(cudaMemcpy(fwkerhalf3, d_fwkerhalf3, sizeof(T) * (nf3 / 2 + 1), cudaMemcpyDeviceToHost)); - for (int i = 0; i < nf1 / 2 + 1; i++) - printf("%10.8e ", fwkerhalf1[i]); - printf("\n"); - if (dim > 1) - for (int i = 0; i < nf2 / 2 + 1; i++) - printf("%10.8e ", fwkerhalf2[i]); - printf("\n"); - if (dim > 2) - for (int i = 0; i < nf3 / 2 + 1; i++) - printf("%10.8e ", fwkerhalf3[i]); - printf("\n"); + checkCudaErrors(cudaMemcpy(fwkerhalf1, d_fwkerhalf1, sizeof(T) * (nf1 / 2 + 1), + cudaMemcpyDeviceToHost)); + if (dim > 1) + checkCudaErrors(cudaMemcpy(fwkerhalf2, d_fwkerhalf2, sizeof(T) * (nf2 / 2 + 1), + cudaMemcpyDeviceToHost)); + if (dim > 2) + checkCudaErrors(cudaMemcpy(fwkerhalf3, d_fwkerhalf3, sizeof(T) * (nf3 / 2 + 1), + cudaMemcpyDeviceToHost)); + for (int i = 0; i < nf1 / 2 + 1; i++) printf("%10.8e ", fwkerhalf1[i]); + printf("\n"); + if (dim > 1) + for (int i = 0; i < nf2 / 2 + 1; i++) printf("%10.8e ", fwkerhalf2[i]); + printf("\n"); + if (dim > 2) + for (int i = 0; i < nf3 / 2 + 1; i++) printf("%10.8e ", fwkerhalf3[i]); + printf("\n"); - return 0; + return 0; } int main(int argc, char *argv[]) { - if (argc < 3) { - fprintf(stderr, "Usage: onedim_fseries_kernel_test prec nf1 [dim [tol [gpuversion [nf2 [nf3]]]]]\n" - "Arguments:\n" - " prec: 'f' or 'd' (float/double)\n" - " nf1: The size of the upsampled fine grid size in x.\n" - " dim: Dimension of the nuFFT.\n" - " tol: NUFFT tolerance (default 1e-6).\n" - " gpuversion: Use gpu version or not (default True).\n" - " nf2: The size of the upsampled fine grid size in y. (default nf1)\n" - " nf3: The size of the upsampled fine grid size in z. 
(default nf3)\n"); - return 1; - } - char prec = argv[1][0]; - int nf1 = std::atof(argv[2]); - int dim = 1; - double eps = 1e-6; - int gpu = 1; - int nf2 = nf1; - int nf3 = nf1; - if (argc > 3) - dim = std::atoi(argv[3]); - if (argc > 4) - eps = std::atof(argv[4]); - if (argc > 5) - gpu = std::atoi(argv[5]); - if (argc > 6) - nf2 = std::atoi(argv[6]); - if (argc > 7) - nf3 = std::atoi(argv[7]); + if (argc < 3) { + fprintf(stderr, + "Usage: onedim_fseries_kernel_test prec nf1 [dim [tol [gpuversion [nf2 " + "[nf3]]]]]\n" + "Arguments:\n" + " prec: 'f' or 'd' (float/double)\n" + " nf1: The size of the upsampled fine grid size in x.\n" + " dim: Dimension of the nuFFT.\n" + " tol: NUFFT tolerance (default 1e-6).\n" + " gpuversion: Use gpu version or not (default True).\n" + " nf2: The size of the upsampled fine grid size in y. (default nf1)\n" + " nf3: The size of the upsampled fine grid size in z. (default nf3)\n"); + return 1; + } + char prec = argv[1][0]; + int nf1 = std::atof(argv[2]); + int dim = 1; + double eps = 1e-6; + int gpu = 1; + int nf2 = nf1; + int nf3 = nf1; + if (argc > 3) dim = std::atoi(argv[3]); + if (argc > 4) eps = std::atof(argv[4]); + if (argc > 5) gpu = std::atoi(argv[5]); + if (argc > 6) nf2 = std::atoi(argv[6]); + if (argc > 7) nf3 = std::atoi(argv[7]); - if (prec == 'f') - return run_test(nf1, dim, eps, gpu, nf2, nf3); - else if (prec == 'd') - return run_test(nf1, dim, eps, gpu, nf2, nf3); - else - return -1; + if (prec == 'f') + return run_test(nf1, dim, eps, gpu, nf2, nf3); + else if (prec == 'd') + return run_test(nf1, dim, eps, gpu, nf2, nf3); + else + return -1; } diff --git a/test/cuda/public_api_test.c b/test/cuda/public_api_test.c index f4a938508..7d0dceef2 100644 --- a/test/cuda/public_api_test.c +++ b/test/cuda/public_api_test.c @@ -1,190 +1,190 @@ #include #include -#include #include +#include #include int test_float(int M, int N) { - // Size of the grid as an array. - int64_t modes[1] = {N}; + // Size of the grid as an array. + int64_t modes[1] = {N}; - // Host pointers: frequencies (x), coefficients (c), and output (f). - float *x; - float _Complex *c; - float _Complex *f; + // Host pointers: frequencies (x), coefficients (c), and output (f). + float *x; + float _Complex *c; + float _Complex *f; - // Device pointers. - float *d_x; - cuFloatComplex *d_c, *d_f; + // Device pointers. + float *d_x; + cuFloatComplex *d_c, *d_f; - // Store cufinufft plan. - cufinufftf_plan plan; + // Store cufinufft plan. + cufinufftf_plan plan; - // Manual calculation at a single point idx. - int idx; - float _Complex f0; + // Manual calculation at a single point idx. + int idx; + float _Complex f0; - // Allocate the host arrays. - x = (float *) malloc(M * sizeof(float)); - c = (float _Complex *) malloc(M * sizeof(float _Complex)); - f = (float _Complex *) malloc(N * sizeof(float _Complex)); + // Allocate the host arrays. + x = (float *)malloc(M * sizeof(float)); + c = (float _Complex *)malloc(M * sizeof(float _Complex)); + f = (float _Complex *)malloc(N * sizeof(float _Complex)); - // Fill with random numbers. Frequencies must be in the interval [-pi, pi] - // while strengths can be any value. - srand(0); + // Fill with random numbers. Frequencies must be in the interval [-pi, pi] + // while strengths can be any value. 
+ srand(0); - for(int j = 0; j < M; ++j) { - x[j] = 2 * M_PI * (((float) rand()) / RAND_MAX - 1); - c[j] = (2 * ((float) rand()) / RAND_MAX - 1) - + I * (2 * ((float) rand()) / RAND_MAX - 1); - } + for (int j = 0; j < M; ++j) { + x[j] = 2 * M_PI * (((float)rand()) / RAND_MAX - 1); + c[j] = + (2 * ((float)rand()) / RAND_MAX - 1) + I * (2 * ((float)rand()) / RAND_MAX - 1); + } - // Allocate the device arrays and copy the x and c arrays. - cudaMalloc((void **)&d_x, M * sizeof(float)); - cudaMalloc((void **)&d_c, M * sizeof(float _Complex)); - cudaMalloc((void **)&d_f, N * sizeof(float _Complex)); + // Allocate the device arrays and copy the x and c arrays. + cudaMalloc((void **)&d_x, M * sizeof(float)); + cudaMalloc((void **)&d_c, M * sizeof(float _Complex)); + cudaMalloc((void **)&d_f, N * sizeof(float _Complex)); - cudaMemcpy(d_x, x, M * sizeof(float), cudaMemcpyHostToDevice); - cudaMemcpy(d_c, c, M * sizeof(float _Complex), cudaMemcpyHostToDevice); + cudaMemcpy(d_x, x, M * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_c, c, M * sizeof(float _Complex), cudaMemcpyHostToDevice); - // Make the cufinufft plan for a 1D type-1 transform with six digits of - // tolerance. - cufinufftf_makeplan(1, 1, modes, 1, 1, 1e-6, &plan, NULL); + // Make the cufinufft plan for a 1D type-1 transform with six digits of + // tolerance. + cufinufftf_makeplan(1, 1, modes, 1, 1, 1e-6, &plan, NULL); - // Set the frequencies of the nonuniform points. - cufinufftf_setpts(plan, M, d_x, NULL, NULL, 0, NULL, NULL, NULL); + // Set the frequencies of the nonuniform points. + cufinufftf_setpts(plan, M, d_x, NULL, NULL, 0, NULL, NULL, NULL); - // Actually execute the plan on the given coefficients and store the result - // in the d_f array. - cufinufftf_execute(plan, d_c, d_f); + // Actually execute the plan on the given coefficients and store the result + // in the d_f array. + cufinufftf_execute(plan, d_c, d_f); - // Copy the result back onto the host. - cudaMemcpy(f, d_f, N * sizeof(float _Complex), cudaMemcpyDeviceToHost); + // Copy the result back onto the host. + cudaMemcpy(f, d_f, N * sizeof(float _Complex), cudaMemcpyDeviceToHost); - // Destroy the plan and free the device arrays after we're done. - cufinufftf_destroy(plan); + // Destroy the plan and free the device arrays after we're done. + cufinufftf_destroy(plan); - cudaFree(d_x); - cudaFree(d_c); - cudaFree(d_f); + cudaFree(d_x); + cudaFree(d_c); + cudaFree(d_f); - // Pick an index to check the result of the calculation. - idx = 4 * N / 7; + // Pick an index to check the result of the calculation. + idx = 4 * N / 7; - printf("f[%d] = %lf + %lfi\n", idx, crealf(f[idx]), cimagf(f[idx])); + printf("f[%d] = %lf + %lfi\n", idx, crealf(f[idx]), cimagf(f[idx])); - // Calculate the result manually using the formula for the type-1 - // transform. - f0 = 0; + // Calculate the result manually using the formula for the type-1 + // transform. + f0 = 0; - for(int j = 0; j < M; ++j) { - f0 += c[j] * cexp(I * x[j] * (idx - N / 2)); - } + for (int j = 0; j < M; ++j) { + f0 += c[j] * cexp(I * x[j] * (idx - N / 2)); + } - printf("f0[%d] = %lf + %lfi\n", idx, crealf(f0), cimagf(f0)); + printf("f0[%d] = %lf + %lfi\n", idx, crealf(f0), cimagf(f0)); - // Finally free the host arrays. - free(x); - free(c); - free(f); + // Finally free the host arrays. + free(x); + free(c); + free(f); - return 0; + return 0; } int test_double(int M, int N) { - // Size of the grid as an array. - int64_t modes[1] = {N}; + // Size of the grid as an array. 
+ int64_t modes[1] = {N}; - // Host pointers: frequencies (x), coefficients (c), and output (f). - double *x; - double _Complex *c; - double _Complex *f; + // Host pointers: frequencies (x), coefficients (c), and output (f). + double *x; + double _Complex *c; + double _Complex *f; - // Device pointers. - double *d_x; - cuDoubleComplex *d_c, *d_f; + // Device pointers. + double *d_x; + cuDoubleComplex *d_c, *d_f; - // Store cufinufft plan. - cufinufft_plan plan; + // Store cufinufft plan. + cufinufft_plan plan; - // Manual calculation at a single point idx. - int idx; - double _Complex f0; + // Manual calculation at a single point idx. + int idx; + double _Complex f0; - // Allocate the host arrays. - x = (double *) malloc(M * sizeof(double)); - c = (double _Complex *) malloc(M * sizeof(double _Complex)); - f = (double _Complex *) malloc(N * sizeof(double _Complex)); + // Allocate the host arrays. + x = (double *)malloc(M * sizeof(double)); + c = (double _Complex *)malloc(M * sizeof(double _Complex)); + f = (double _Complex *)malloc(N * sizeof(double _Complex)); - // Fill with random numbers. Frequencies must be in the interval [-pi, pi] - // while strengths can be any value. - srand(0); + // Fill with random numbers. Frequencies must be in the interval [-pi, pi] + // while strengths can be any value. + srand(0); - for(int j = 0; j < M; ++j) { - x[j] = 2 * M_PI * (((double) rand()) / RAND_MAX - 1); - c[j] = (2 * ((double) rand()) / RAND_MAX - 1) - + I * (2 * ((double) rand()) / RAND_MAX - 1); - } + for (int j = 0; j < M; ++j) { + x[j] = 2 * M_PI * (((double)rand()) / RAND_MAX - 1); + c[j] = + (2 * ((double)rand()) / RAND_MAX - 1) + I * (2 * ((double)rand()) / RAND_MAX - 1); + } - // Allocate the device arrays and copy the x and c arrays. - cudaMalloc((void **)&d_x, M * sizeof(double)); - cudaMalloc((void **)&d_c, M * sizeof(double _Complex)); - cudaMalloc((void **)&d_f, N * sizeof(double _Complex)); + // Allocate the device arrays and copy the x and c arrays. + cudaMalloc((void **)&d_x, M * sizeof(double)); + cudaMalloc((void **)&d_c, M * sizeof(double _Complex)); + cudaMalloc((void **)&d_f, N * sizeof(double _Complex)); - cudaMemcpy(d_x, x, M * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(d_c, c, M * sizeof(double _Complex), cudaMemcpyHostToDevice); + cudaMemcpy(d_x, x, M * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(d_c, c, M * sizeof(double _Complex), cudaMemcpyHostToDevice); - // Make the cufinufft plan for a 1D type-1 transform with six digits of - // tolerance. - cufinufft_makeplan(1, 1, modes, 1, 1, 1e-6, &plan, NULL); + // Make the cufinufft plan for a 1D type-1 transform with six digits of + // tolerance. + cufinufft_makeplan(1, 1, modes, 1, 1, 1e-6, &plan, NULL); - // Set the frequencies of the nonuniform points. - cufinufft_setpts(plan, M, d_x, NULL, NULL, 0, NULL, NULL, NULL); + // Set the frequencies of the nonuniform points. + cufinufft_setpts(plan, M, d_x, NULL, NULL, 0, NULL, NULL, NULL); - // Actually execute the plan on the given coefficients and store the result - // in the d_f array. - cufinufft_execute(plan, d_c, d_f); + // Actually execute the plan on the given coefficients and store the result + // in the d_f array. + cufinufft_execute(plan, d_c, d_f); - // Copy the result back onto the host. - cudaMemcpy(f, d_f, N * sizeof(double _Complex), cudaMemcpyDeviceToHost); + // Copy the result back onto the host. + cudaMemcpy(f, d_f, N * sizeof(double _Complex), cudaMemcpyDeviceToHost); - // Destroy the plan and free the device arrays after we're done. 
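// [Illustrative summary] Condensed, the call sequence public_api_test.c exercises in
// both precisions is the four-stage plan lifecycle below (single precision shown; host
// setup, device allocation and error checking omitted; names as in the test above).
// The double-precision variants simply drop the trailing 'f': cufinufft_makeplan, etc.
int64_t modes[1] = {N};                     // grid size as a length-1 array (1D)
cufinufftf_plan plan;
cufinufftf_makeplan(1, 1, modes, 1, 1, 1e-6, &plan, NULL);        // type 1, dim 1, iflag +1, ntransf 1, tol 1e-6
cufinufftf_setpts(plan, M, d_x, NULL, NULL, 0, NULL, NULL, NULL); // M nonuniform points in d_x (device)
cufinufftf_execute(plan, d_c, d_f);                               // strengths d_c in, modes d_f out (device)
cufinufftf_destroy(plan);                                         // release plan resources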
- cufinufft_destroy(plan); + // Destroy the plan and free the device arrays after we're done. + cufinufft_destroy(plan); - cudaFree(d_x); - cudaFree(d_c); - cudaFree(d_f); + cudaFree(d_x); + cudaFree(d_c); + cudaFree(d_f); - // Pick an index to check the result of the calculation. - idx = 4 * N / 7; + // Pick an index to check the result of the calculation. + idx = 4 * N / 7; - printf("f[%d] = %lf + %lfi\n", idx, crealf(f[idx]), cimagf(f[idx])); + printf("f[%d] = %lf + %lfi\n", idx, crealf(f[idx]), cimagf(f[idx])); - // Calculate the result manually using the formula for the type-1 - // transform. - f0 = 0; + // Calculate the result manually using the formula for the type-1 + // transform. + f0 = 0; - for(int j = 0; j < M; ++j) { - f0 += c[j] * cexp(I * x[j] * (idx - N / 2)); - } + for (int j = 0; j < M; ++j) { + f0 += c[j] * cexp(I * x[j] * (idx - N / 2)); + } - printf("f0[%d] = %lf + %lfi\n", idx, crealf(f0), cimagf(f0)); + printf("f0[%d] = %lf + %lfi\n", idx, crealf(f0), cimagf(f0)); - // Finally free the host arrays. - free(x); - free(c); - free(f); + // Finally free the host arrays. + free(x); + free(c); + free(f); - return 0; + return 0; } int main() { - // Problem size: number of nonuniform points (M) and grid size (N). - const int M = 100, N = 200; - int errf = test_float(M, N); - int err = test_double(M, N); + // Problem size: number of nonuniform points (M) and grid size (N). + const int M = 100, N = 200; + int errf = test_float(M, N); + int err = test_double(M, N); - return (err | errf); + return (err | errf); } diff --git a/test/cuda/test_makeplan.c b/test/cuda/test_makeplan.c index e9cb247ef..0d51c7170 100644 --- a/test/cuda/test_makeplan.c +++ b/test/cuda/test_makeplan.c @@ -15,206 +15,226 @@ #include typedef struct { - char *p[2]; + char *p[2]; } wasteful_pointers; // hackish way to make allocation failures happen wasteful_pointers alloc_remaining_device_mem() { - wasteful_pointers a = {NULL, NULL}; - for (int i = 0; i < 2; ++i) { - size_t free, total; - cudaMemGetInfo(&free, &total); - - int ier = 1; - int iter = 0; - while (ier && (iter < 60)) { - ier = cudaMalloc((void **)&a.p[i], free - (1 << iter)); - iter++; - } + wasteful_pointers a = {NULL, NULL}; + for (int i = 0; i < 2; ++i) { + size_t free, total; + cudaMemGetInfo(&free, &total); + + int ier = 1; + int iter = 0; + while (ier && (iter < 60)) { + ier = cudaMalloc((void **)&a.p[i], free - (1 << iter)); + iter++; } + } - return a; + return a; } void free_wasteful_pointers(wasteful_pointers a) { - cudaFree(a.p[0]); - cudaFree(a.p[1]); + cudaFree(a.p[0]); + cudaFree(a.p[1]); } int main() { - cufinufftf_plan plan; - // defaults. tests should shadow them to override - const int dim = 1; - const int type = 1; - const int iflag = 1; - const float tol = 1e-5; + cufinufftf_plan plan; + // defaults. 
tests should shadow them to override + const int dim = 1; + const int type = 1; + const int iflag = 1; + const float tol = 1e-5; + const int ntransf = 1; + const int64_t N[3] = {10, 20, 15}; + + // Dimension failure + { + const int dim = 0; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == + FINUFFT_ERR_DIM_NOTVALID); + cudaDeviceSynchronize(); + } + + // 1D failure modes + { + const int dim = 1; + const int type = 1; const int ntransf = 1; - const int64_t N[3] = {10, 20, 15}; - // Dimension failure + // nice input should succeed + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == 0); + cufinufftf_destroy(plan); + cudaDeviceSynchronize(); + + // Ignore higher dims, even if invalid + { + int64_t N[3] = {10, 0, 15}; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == 0); + cufinufftf_destroy(plan); + cudaDeviceSynchronize(); + } + + { + int64_t N[3] = {0, 20, 15}; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == + FINUFFT_ERR_NDATA_NOTVALID); + cudaDeviceSynchronize(); + } + + // cufinufft can't handle arrays bigger than INT32_MAX (cufft limitation) { - const int dim = 0; - assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == FINUFFT_ERR_DIM_NOTVALID); - cudaDeviceSynchronize(); + int64_t N[3] = {(int64_t)INT32_MAX + 1, 1, 1}; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == + FINUFFT_ERR_NDATA_NOTVALID); + cudaDeviceSynchronize(); } - // 1D failure modes { - const int dim = 1; - const int type = 1; - const int ntransf = 1; - - // nice input should succeed - assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == 0); - cufinufftf_destroy(plan); - cudaDeviceSynchronize(); - - // Ignore higher dims, even if invalid - { - int64_t N[3] = {10, 0, 15}; - assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == 0); - cufinufftf_destroy(plan); - cudaDeviceSynchronize(); - } - - { - int64_t N[3] = {0, 20, 15}; - assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == FINUFFT_ERR_NDATA_NOTVALID); - cudaDeviceSynchronize(); - } - - // cufinufft can't handle arrays bigger than INT32_MAX (cufft limitation) - { - int64_t N[3] = {(int64_t)INT32_MAX + 1, 1, 1}; - assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == FINUFFT_ERR_NDATA_NOTVALID); - cudaDeviceSynchronize(); - } - - { - const int ntransf = 0; - assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == FINUFFT_ERR_NTRANS_NOTVALID); - cudaDeviceSynchronize(); - } - - { - const int type = 4; - assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == FINUFFT_ERR_TYPE_NOTVALID); - cudaDeviceSynchronize(); - } - - /* { */ - /* wasteful_pointers p = alloc_remaining_device_mem(); */ - /* int64_t N[3] = {INT32_MAX, 1, 1}; */ - /* assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == FINUFFT_ERR_CUDA_FAILURE); - */ - /* free_wasteful_pointers(p); */ - /* } */ + const int ntransf = 0; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == + FINUFFT_ERR_NTRANS_NOTVALID); + cudaDeviceSynchronize(); } { - const int dim = 2; - assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == 0); - cudaDeviceSynchronize(); - - cufinufftf_destroy(plan); - - { - int64_t N[3] = {10, 0, 1}; - assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == FINUFFT_ERR_NDATA_NOTVALID); - 
cudaDeviceSynchronize(); - } - - // FIXME: nf calculation overflows -- need to handle upsampling mode calculation properly - /* { */ - /* int64_t N[3] = {INT32_MAX / 2, 2, 1}; */ - /* assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == 0); */ - /* } */ - - { - int64_t N[3] = {INT32_MAX, 2, 1}; - assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == FINUFFT_ERR_NDATA_NOTVALID); - cudaDeviceSynchronize(); - } - - { - const int type = 4; - assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == FINUFFT_ERR_TYPE_NOTVALID); - cudaDeviceSynchronize(); - } - - { - const int ntransf = 0; - assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == FINUFFT_ERR_NTRANS_NOTVALID); - cudaDeviceSynchronize(); - } - - { - cufinufft_opts opts; - cufinufft_default_opts(&opts); - opts.upsampfac = 0.9; - opts.gpu_kerevalmeth = 1; - assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, &opts) == - FINUFFT_ERR_HORNER_WRONG_BETA); - - opts.gpu_kerevalmeth = 0; - assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, &opts) == - FINUFFT_ERR_UPSAMPFAC_TOO_SMALL); - - // Should produce a warning, not an error - opts.upsampfac = 4.5; - assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, &opts) == 0); - cufinufftf_destroy(plan); - cudaDeviceSynchronize(); - } - - // This technique to cause cuda failures works most of the time, but sometimes would - // break following calls and could cause issues with other contexts using the same GPU - /* { */ - /* wasteful_pointers p = alloc_remaining_device_mem(); */ - /* int64_t N[3] = {sqrt(INT32_MAX - 1), sqrt(INT32_MAX) - 1, 1}; */ - /* assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == FINUFFT_ERR_CUDA_FAILURE); - */ - /* free_wasteful_pointers(p); */ - /* } */ + const int type = 4; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == + FINUFFT_ERR_TYPE_NOTVALID); + cudaDeviceSynchronize(); } + /* { */ + /* wasteful_pointers p = alloc_remaining_device_mem(); */ + /* int64_t N[3] = {INT32_MAX, 1, 1}; */ + /* assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) + * == FINUFFT_ERR_CUDA_FAILURE); + */ + /* free_wasteful_pointers(p); */ + /* } */ + } + + { + const int dim = 2; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == 0); + cudaDeviceSynchronize(); + + cufinufftf_destroy(plan); + { - const int dim = 3; - assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == 0); - cufinufftf_destroy(plan); - cudaDeviceSynchronize(); - - { - int64_t N[3] = {10, 15, 0}; - assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == FINUFFT_ERR_NDATA_NOTVALID); - cudaDeviceSynchronize(); - } - - { - int64_t N[3] = {INT32_MAX / 2, 2, 2}; - assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == FINUFFT_ERR_NDATA_NOTVALID); - cudaDeviceSynchronize(); - } - - { - const int type = 4; - assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == FINUFFT_ERR_TYPE_NOTVALID); - cudaDeviceSynchronize(); - } - - { - const int ntransf = 0; - assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == FINUFFT_ERR_NTRANS_NOTVALID); - cudaDeviceSynchronize(); - } - - /* { */ - /* wasteful_pointers p = alloc_remaining_device_mem(); */ - /* int64_t N[3] = {pow(INT32_MAX - 1, 1.0 / 3), pow(INT32_MAX - 1, 1.0 / 3), pow(INT32_MAX - 1, 1.0 / 3)}; - */ - /* 
assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == FINUFFT_ERR_CUDA_FAILURE); - */ - /* free_wasteful_pointers(p); */ - /* } */ + int64_t N[3] = {10, 0, 1}; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == + FINUFFT_ERR_NDATA_NOTVALID); + cudaDeviceSynchronize(); } + + // FIXME: nf calculation overflows -- need to handle upsampling mode calculation + // properly + /* { */ + /* int64_t N[3] = {INT32_MAX / 2, 2, 1}; */ + /* assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) + * == 0); */ + /* } */ + + { + int64_t N[3] = {INT32_MAX, 2, 1}; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == + FINUFFT_ERR_NDATA_NOTVALID); + cudaDeviceSynchronize(); + } + + { + const int type = 4; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == + FINUFFT_ERR_TYPE_NOTVALID); + cudaDeviceSynchronize(); + } + + { + const int ntransf = 0; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == + FINUFFT_ERR_NTRANS_NOTVALID); + cudaDeviceSynchronize(); + } + + { + cufinufft_opts opts; + cufinufft_default_opts(&opts); + opts.upsampfac = 0.9; + opts.gpu_kerevalmeth = 1; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, &opts) == + FINUFFT_ERR_HORNER_WRONG_BETA); + + opts.gpu_kerevalmeth = 0; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, &opts) == + FINUFFT_ERR_UPSAMPFAC_TOO_SMALL); + + // Should produce a warning, not an error + opts.upsampfac = 4.5; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, &opts) == 0); + cufinufftf_destroy(plan); + cudaDeviceSynchronize(); + } + + // This technique to cause cuda failures works most of the time, but sometimes + // would break following calls and could cause issues with other contexts using + // the same GPU + /* { */ + /* wasteful_pointers p = alloc_remaining_device_mem(); */ + /* int64_t N[3] = {sqrt(INT32_MAX - 1), sqrt(INT32_MAX) - 1, 1}; */ + /* assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) + * == FINUFFT_ERR_CUDA_FAILURE); + */ + /* free_wasteful_pointers(p); */ + /* } */ + } + + { + const int dim = 3; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == 0); + cufinufftf_destroy(plan); + cudaDeviceSynchronize(); + + { + int64_t N[3] = {10, 15, 0}; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == + FINUFFT_ERR_NDATA_NOTVALID); + cudaDeviceSynchronize(); + } + + { + int64_t N[3] = {INT32_MAX / 2, 2, 2}; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == + FINUFFT_ERR_NDATA_NOTVALID); + cudaDeviceSynchronize(); + } + + { + const int type = 4; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == + FINUFFT_ERR_TYPE_NOTVALID); + cudaDeviceSynchronize(); + } + + { + const int ntransf = 0; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == + FINUFFT_ERR_NTRANS_NOTVALID); + cudaDeviceSynchronize(); + } + + /* { */ + /* wasteful_pointers p = alloc_remaining_device_mem(); */ + /* int64_t N[3] = {pow(INT32_MAX - 1, 1.0 / 3), pow(INT32_MAX - 1, 1.0 / 3), + * pow(INT32_MAX - 1, 1.0 / 3)}; + */ + /* assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) + * == FINUFFT_ERR_CUDA_FAILURE); + */ + /* free_wasteful_pointers(p); */ + /* } */ + } } diff --git a/test/directft/dirft1d.cpp b/test/directft/dirft1d.cpp index a52d826c4..5f36d76d7 100644 --- 
a/test/directft/dirft1d.cpp +++ b/test/directft/dirft1d.cpp @@ -1,14 +1,14 @@ -#include #include +#include #include // This is basically a port of dirft1d.f from CMCL package, except with // the 1/nj prefactors for type-1 removed. -void dirft1d1(BIGINT nj,FLT* x,CPX* c,int iflag,BIGINT ms, CPX* f) +void dirft1d1(BIGINT nj, FLT *x, CPX *c, int iflag, BIGINT ms, CPX *f) /* Direct computation of 1D type-1 nonuniform FFT. Interface same as finufft1d1. c nj-1 -c f[k1] = SUM c[j] exp(+-i k1 x[j]) +c f[k1] = SUM c[j] exp(+-i k1 x[j]) c j=0 c c for -ms/2 <= k1 <= (ms-1)/2. @@ -17,24 +17,24 @@ c used, otherwise the - sign is used, in the exponential. * Uses C++ complex type and winding trick. Barnett 1/25/17 */ { - BIGINT kmin = -(ms/2); // integer divide - for (BIGINT m=0;m0) ? exp(IMA*x[j]) : exp(-IMA*x[j]); - CPX p = pow(a,(FLT)kmin); // starting phase for most neg freq + BIGINT kmin = -(ms / 2); // integer divide + for (BIGINT m = 0; m < ms; ++m) f[m] = CPX(0, 0); // it knows f is complex type + for (BIGINT j = 0; j < nj; ++j) { + CPX a = (iflag > 0) ? exp(IMA * x[j]) : exp(-IMA * x[j]); + CPX p = pow(a, (FLT)kmin); // starting phase for most neg freq CPX cc = c[j]; // no 1/nj prefac - for (BIGINT m=0;m0 the + sign is @@ -42,12 +42,12 @@ c used, otherwise the - sign is used, in the exponential. * Uses C++ complex type and winding trick. Barnett 1/25/17 */ { - BIGINT kmin = -(ms/2); // integer divide - for (BIGINT j=0;j0) ? exp(IMA*x[j]) : exp(-IMA*x[j]); - CPX p = pow(a,(FLT)kmin); // starting phase for most neg freq - CPX cc = CPX(0,0); - for (BIGINT m=0;m 0) ? exp(IMA * x[j]) : exp(-IMA * x[j]); + CPX p = pow(a, (FLT)kmin); // starting phase for most neg freq + CPX cc = CPX(0, 0); + for (BIGINT m = 0; m < ms; ++m) { cc += f[m] * p; p *= a; } @@ -55,20 +55,19 @@ c used, otherwise the - sign is used, in the exponential. } } -void dirft1d3(BIGINT nj,FLT* x,CPX* c,int iflag,BIGINT nk, FLT* s, CPX* f) +void dirft1d3(BIGINT nj, FLT *x, CPX *c, int iflag, BIGINT nk, FLT *s, CPX *f) /* Direct computation of 1D type-3 nonuniform FFT. Interface same as finufft1d3 c nj-1 -c f[k] = SUM c[j] exp(+-i s[k] x[j]) -c j=0 +c f[k] = SUM c[j] exp(+-i s[k] x[j]) +c j=0 c for k = 0, ..., nk-1 c If iflag>0 the + sign is used, otherwise the - sign is used, in the c exponential. Uses C++ complex type. Simple brute force. Barnett 1/25/17 */ { - for (BIGINT k=0;k0) ? IMA*s[k] : -IMA*s[k]; - f[k] = CPX(0,0); - for (BIGINT j=0;j 0) ? IMA * s[k] : -IMA * s[k]; + f[k] = CPX(0, 0); + for (BIGINT j = 0; j < nj; ++j) f[k] += c[j] * exp(ss * x[j]); } } diff --git a/test/directft/dirft2d.cpp b/test/directft/dirft2d.cpp index 4f91141f6..c13661549 100644 --- a/test/directft/dirft2d.cpp +++ b/test/directft/dirft2d.cpp @@ -1,11 +1,11 @@ -#include #include +#include #include // This is basically a port of dirft2d.f from CMCL package, except with // the 1/nj prefactors for type-1 removed. -void dirft2d1(BIGINT nj,FLT* x,FLT *y,CPX* c,int iflag,BIGINT ms, BIGINT mt, CPX* f) +void dirft2d1(BIGINT nj, FLT *x, FLT *y, CPX *c, int iflag, BIGINT ms, BIGINT mt, CPX *f) /* Direct computation of 2D type-1 nonuniform FFT. Interface same as finufft2d1. c nj-1 c f[k1,k2] = SUM c[j] exp(+-i (k1 x[j] + k2 y[j])) @@ -18,32 +18,32 @@ c used, otherwise the - sign is used, in the exponential. * Uses C++ complex type and winding trick. Barnett 1/26/17 */ { - BIGINT k1min = -(ms/2), k2min = -(mt/2); // integer divide - BIGINT N = ms*mt; // total # output modes - for (BIGINT m=0;m0) ? exp(IMA*x[j]) : exp(-IMA*x[j]); - CPX a2 = (iflag>0) ? 
exp(IMA*y[j]) : exp(-IMA*y[j]); - CPX sp1 = pow(a1,(FLT)k1min); // starting phase for most neg k1 freq - CPX p2 = pow(a2,(FLT)k2min); - CPX cc = c[j]; // no 1/nj norm - BIGINT m=0; // output pointer - for (BIGINT m2=0;m2 0) ? exp(IMA * x[j]) : exp(-IMA * x[j]); + CPX a2 = (iflag > 0) ? exp(IMA * y[j]) : exp(-IMA * y[j]); + CPX sp1 = pow(a1, (FLT)k1min); // starting phase for most neg k1 freq + CPX p2 = pow(a2, (FLT)k2min); + CPX cc = c[j]; // no 1/nj norm + BIGINT m = 0; // output pointer + for (BIGINT m2 = 0; m2 < mt; ++m2) { + CPX p1 = sp1; // must reset p1 for each inner loop + for (BIGINT m1 = 0; m1 < ms; ++m1) { // ms is fast, mt slow + f[m++] += cc * p1 * p2; + p1 *= a1; } p2 *= a2; } } } -void dirft2d2(BIGINT nj,FLT* x,FLT *y,CPX* c,int iflag,BIGINT ms, BIGINT mt, CPX* f) +void dirft2d2(BIGINT nj, FLT *x, FLT *y, CPX *c, int iflag, BIGINT ms, BIGINT mt, CPX *f) /* Direct computation of 2D type-2 nonuniform FFT. Interface same as finufft2d2 - c[j] = SUM f[k1,k2] exp(+-i (k1 x[j] + k2 y[j])) - k1,k2 + c[j] = SUM f[k1,k2] exp(+-i (k1 x[j] + k2 y[j])) + k1,k2 for j = 0,...,nj-1 where sum is over -ms/2 <= k1 <= (ms-1)/2, -mt/2 <= k2 <= (mt-1)/2. @@ -54,19 +54,19 @@ void dirft2d2(BIGINT nj,FLT* x,FLT *y,CPX* c,int iflag,BIGINT ms, BIGINT mt, CPX Uses C++ complex type and winding trick. Barnett 1/26/17 */ { - BIGINT k1min = -(ms/2), k2min = -(mt/2); // integer divide - for (BIGINT j=0;j0) ? exp(IMA*x[j]) : exp(-IMA*x[j]); - CPX a2 = (iflag>0) ? exp(IMA*y[j]) : exp(-IMA*y[j]); - CPX sp1 = pow(a1,(FLT)k1min); - CPX p2 = pow(a2,(FLT)k2min); - CPX cc = CPX(0,0); - BIGINT m=0; // input pointer - for (BIGINT m2=0;m2 0) ? exp(IMA * x[j]) : exp(-IMA * x[j]); + CPX a2 = (iflag > 0) ? exp(IMA * y[j]) : exp(-IMA * y[j]); + CPX sp1 = pow(a1, (FLT)k1min); + CPX p2 = pow(a2, (FLT)k2min); + CPX cc = CPX(0, 0); + BIGINT m = 0; // input pointer + for (BIGINT m2 = 0; m2 < mt; ++m2) { CPX p1 = sp1; - for (BIGINT m1=0;m10 the + sign is used, otherwise the - sign is used, in the c exponential. Uses C++ complex type. Simple brute force. Barnett 1/26/17 */ { - for (BIGINT k=0;k0) ? IMA*s[k] : -IMA*s[k]; - CPX tt = (iflag>0) ? IMA*t[k] : -IMA*t[k]; - f[k] = CPX(0,0); - for (BIGINT j=0;j 0) ? IMA * s[k] : -IMA * s[k]; + CPX tt = (iflag > 0) ? IMA * t[k] : -IMA * t[k]; + f[k] = CPX(0, 0); + for (BIGINT j = 0; j < nj; ++j) f[k] += c[j] * exp(ss * x[j] + tt * y[j]); } } diff --git a/test/directft/dirft3d.cpp b/test/directft/dirft3d.cpp index 63e002283..452b62471 100644 --- a/test/directft/dirft3d.cpp +++ b/test/directft/dirft3d.cpp @@ -1,11 +1,12 @@ -#include #include +#include #include // This is basically a port of dirft2d.f from CMCL package, except with // the 1/nj prefactors for type-1 removed. -void dirft3d1(BIGINT nj,FLT* x,FLT *y,FLT *z, CPX* c,int iflag,BIGINT ms, BIGINT mt, BIGINT mu, CPX* f) +void dirft3d1(BIGINT nj, FLT *x, FLT *y, FLT *z, CPX *c, int iflag, BIGINT ms, BIGINT mt, + BIGINT mu, CPX *f) /* Direct computation of 3D type-1 nonuniform FFT. Interface same as finufft3d1. c nj-1 c f[k1,k2,k3] = SUM c[j] exp(+-i (k1 x[j] + k2 y[j] + k2 z[j])) @@ -19,38 +20,39 @@ c used, otherwise the - sign is used, in the exponential. * Uses C++ complex type and winding trick. Barnett 2/1/17 */ { - BIGINT k1min = -(ms/2), k2min = -(mt/2), k3min = -(mu/2); // integer divide - BIGINT N = ms*mt*mu; // total # output modes - for (BIGINT m=0;m0) ? exp(IMA*x[j]) : exp(-IMA*x[j]); - CPX a2 = (iflag>0) ? exp(IMA*y[j]) : exp(-IMA*y[j]); - CPX a3 = (iflag>0) ? 
exp(IMA*z[j]) : exp(-IMA*z[j]); - CPX sp1 = pow(a1,(FLT)k1min); // starting phase for most neg k1 freq - CPX sp2 = pow(a2,(FLT)k2min); - CPX p3 = pow(a3,(FLT)k3min); - CPX cc = c[j]; // no 1/nj norm - BIGINT m=0; // output pointer - for (BIGINT m3=0;m3 0) ? exp(IMA * x[j]) : exp(-IMA * x[j]); + CPX a2 = (iflag > 0) ? exp(IMA * y[j]) : exp(-IMA * y[j]); + CPX a3 = (iflag > 0) ? exp(IMA * z[j]) : exp(-IMA * z[j]); + CPX sp1 = pow(a1, (FLT)k1min); // starting phase for most neg k1 freq + CPX sp2 = pow(a2, (FLT)k2min); + CPX p3 = pow(a3, (FLT)k3min); + CPX cc = c[j]; // no 1/nj norm + BIGINT m = 0; // output pointer + for (BIGINT m3 = 0; m3 < mu; ++m3) { CPX p2 = sp2; - for (BIGINT m2=0;m20) ? exp(IMA*x[j]) : exp(-IMA*x[j]); - CPX a2 = (iflag>0) ? exp(IMA*y[j]) : exp(-IMA*y[j]); - CPX a3 = (iflag>0) ? exp(IMA*z[j]) : exp(-IMA*z[j]); - CPX sp1 = pow(a1,(FLT)k1min); - CPX sp2 = pow(a2,(FLT)k2min); - CPX p3 = pow(a3,(FLT)k3min); - CPX cc = CPX(0,0); - BIGINT m=0; // input pointer - for (BIGINT m3=0;m3 0) ? exp(IMA * x[j]) : exp(-IMA * x[j]); + CPX a2 = (iflag > 0) ? exp(IMA * y[j]) : exp(-IMA * y[j]); + CPX a3 = (iflag > 0) ? exp(IMA * z[j]) : exp(-IMA * z[j]); + CPX sp1 = pow(a1, (FLT)k1min); + CPX sp2 = pow(a2, (FLT)k2min); + CPX p3 = pow(a3, (FLT)k3min); + CPX cc = CPX(0, 0); + BIGINT m = 0; // input pointer + for (BIGINT m3 = 0; m3 < mu; ++m3) { CPX p2 = sp2; - for (BIGINT m2=0;m20 the + sign is used, otherwise the - sign is used, in the c exponential. Uses C++ complex type. Simple brute force. Barnett 2/1/17 */ { - for (BIGINT k=0;k0) ? IMA*s[k] : -IMA*s[k]; - CPX tt = (iflag>0) ? IMA*t[k] : -IMA*t[k]; - CPX uu = (iflag>0) ? IMA*u[k] : -IMA*u[k]; - f[k] = CPX(0,0); - for (BIGINT j=0;j 0) ? IMA * s[k] : -IMA * s[k]; + CPX tt = (iflag > 0) ? IMA * t[k] : -IMA * t[k]; + CPX uu = (iflag > 0) ? IMA * u[k] : -IMA * u[k]; + f[k] = CPX(0, 0); + for (BIGINT j = 0; j < nj; ++j) f[k] += c[j] * exp(ss * x[j] + tt * y[j] + uu * z[j]); } } diff --git a/test/dumbinputs.cpp b/test/dumbinputs.cpp index b1e8bc6a9..5e74d6d30 100644 --- a/test/dumbinputs.cpp +++ b/test/dumbinputs.cpp @@ -6,7 +6,7 @@ Usage (linux): ./dumbinputs{f} 2> /dev/null (since FINUFFT will spit msgs to stderr, to be ignored) - + Pass: exit code 0. (Stdout should indicate passed) Fail: exit code>0. (Stdout may indicate what failed) @@ -24,61 +24,62 @@ Removed the chkbnds case to 1d1, 05/08/2024. Suggested compile: - g++ -std=c++14 -fopenmp dumbinputs.cpp -I../include ../lib/libfinufft.so -o dumbinputs -lfftw3 -lfftw3_omp -lm - g++ -std=c++14 -fopenmp dumbinputs.cpp -I../include ../lib/libfinufft.so -o dumbinputsf -lfftw3 -lfftw3_omp -lm -DSINGLE + g++ -std=c++14 -fopenmp dumbinputs.cpp -I../include ../lib/libfinufft.so -o dumbinputs + -lfftw3 -lfftw3_omp -lm g++ -std=c++14 -fopenmp dumbinputs.cpp -I../include + ../lib/libfinufft.so -o dumbinputsf -lfftw3 -lfftw3_omp -lm -DSINGLE or if you have built a single-core version: - g++ -std=c++14 dumbinputs.cpp -I../include ../lib/libfinufft.so -o dumbinputs -lfftw3 -lm - etc + g++ -std=c++14 dumbinputs.cpp -I../include ../lib/libfinufft.so -o dumbinputs -lfftw3 + -lm etc */ // This switches FLT macro from double to float if SINGLE is defined, etc... 
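// [Orientation note] The dirft*.cpp ports above are the brute-force reference transforms
// that dumbinputs.cpp includes below. dirft1d1, for instance, evaluates the type-1 sum
// f[k1] = sum_j c[j] exp(+-i k1 x[j]) literally, but with the "winding trick": one
// complex multiply per mode (p *= a) instead of one exp() per mode. An equivalent,
// slower sketch without the trick, using the FLT/CPX/BIGINT/IMA aliases from the tests'
// common definitions header:
BIGINT kmin = -(ms / 2);                          // most negative frequency
for (BIGINT m = 0; m < ms; ++m) {
  f[m] = CPX(0, 0);
  for (BIGINT j = 0; j < nj; ++j)                 // O(nj*ms): one exp per (mode, point)
    f[m] += c[j] * exp(((FLT)(kmin + m)) * (iflag > 0 ? IMA : -IMA) * x[j]);
}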
-#include #include "directft/dirft1d.cpp" #include "directft/dirft2d.cpp" #include "directft/dirft3d.cpp" +#include using namespace std; -using namespace finufft::utils; // for twonorm, etc +using namespace finufft::utils; // for twonorm, etc -int main(int argc, char* argv[]) -{ - int M = 100; // number of nonuniform points - int N = 10; // # modes, keep small, also output NU pts in type 3 +int main(int argc, char *argv[]) { + int M = 100; // number of nonuniform points + int N = 10; // # modes, keep small, also output NU pts in type 3 #ifdef SINGLE - FLT acc = 1e-5; // desired accuracy for NUFFTs (prec-dep) + FLT acc = 1e-5; // desired accuracy for NUFFTs (prec-dep) #else - FLT acc = 1e-8; // desired accuracy for NUFFTs + FLT acc = 1e-8; // desired accuracy for NUFFTs #endif - finufft_opts opts; FINUFFT_DEFAULT_OPTS(&opts); + finufft_opts opts; + FINUFFT_DEFAULT_OPTS(&opts); - int NN = N*N*N; // modes F alloc size since we'll go to 3d + int NN = N * N * N; // modes F alloc size since we'll go to 3d // generate some "random" nonuniform points (x) and complex strengths (c): - FLT *x = (FLT *)malloc(sizeof(FLT)*M); - CPX* c = (CPX*)malloc(sizeof(CPX)*M); - for (int j=0; j100*acc) { - printf("1d3 M=1:\tier=%d nrm(err)=%.3g\n",ier,err); + ier = FINUFFT1D3(1, x, c, +1, acc, N, s, F, &opts); // XK prod formally 0 + dirft1d3(1, x, c, +1, N, s, Fe); + for (int k = 0; k < N; ++k) F[k] -= Fe[k]; // acc chk + FLT err = twonorm(N, F) / sqrt((FLT)N); + if (ier || err > 100 * acc) { + printf("1d3 M=1:\tier=%d nrm(err)=%.3g\n", ier, err); return 1; } - ier = FINUFFT1D3(M,x,c,+1,acc,1,s,F,&opts); - dirft1d3(M,x,c,+1,1,s,Fe); - err = abs(F[0]-Fe[0]); - if (ier || err>10*acc) { - printf("1d3 nk=1:\tier=%d err=%.3g\n",ier,err); + ier = FINUFFT1D3(M, x, c, +1, acc, 1, s, F, &opts); + dirft1d3(M, x, c, +1, 1, s, Fe); + err = abs(F[0] - Fe[0]); + if (ier || err > 10 * acc) { + printf("1d3 nk=1:\tier=%d err=%.3g\n", ier, err); return 1; } - ier = FINUFFT1D3(1,x,c,+1,acc,1,s,F,&opts); - dirft1d3(1,x,c,+1,1,s,Fe); - err = abs(F[0]-Fe[0]); - if (ier || err>10*acc) { - printf("1d3 M=nk=1:\tier=%d err=%.3g\n",ier,err); + ier = FINUFFT1D3(1, x, c, +1, acc, 1, s, F, &opts); + dirft1d3(1, x, c, +1, 1, s, Fe); + err = abs(F[0] - Fe[0]); + if (ier || err > 10 * acc) { + printf("1d3 M=nk=1:\tier=%d err=%.3g\n", ier, err); return 1; } - ier = FINUFFT1D3(M,x,c,+1,acc,N,shuge,F,&opts); - if (ier==0) { // any nonzero code accepted here - printf("1d3 XK prod too big:\twrong error code %d\n",ier); + ier = FINUFFT1D3(M, x, c, +1, acc, N, shuge, F, &opts); + if (ier == 0) { // any nonzero code accepted here + printf("1d3 XK prod too big:\twrong error code %d\n", ier); return 1; } - int ndata = 10; // how many multiple vectors to test it on - CPX* cm = (CPX*)malloc(sizeof(CPX)*M*ndata); - CPX* Fm = (CPX*)malloc(sizeof(CPX)*NN*ndata); // the biggest array - for (int j=0; j100*acc) { - printf("1d3many M=1:\tier=%d nrm(err)=%.3g\n",ier,err); + ier = FINUFFT1D3MANY(ndata, 1, x, cm, +1, acc, N, s, Fm, &opts); // XK prod formally 0 + dirft1d3(1, x, c, +1, N, s, Fe); + for (int k = 0; k < N; ++k) Fm[k] -= Fe[k]; // acc chk + err = twonorm(N, Fm) / sqrt((FLT)N); // rms, to 5e-5 abs; check just first trial + if (ier || err > 100 * acc) { + printf("1d3many M=1:\tier=%d nrm(err)=%.3g\n", ier, err); return 1; } - ier = FINUFFT1D3MANY(ndata,M,x,cm,+1,acc,1,s,Fm,&opts); - dirft1d3(M,x,c,+1,1,s,Fe); - err = abs(Fm[0]-Fe[0]); - if (ier || err>10*acc) { - printf("1d3many nk=1:\tier=%d err=%.3g\n",ier,err); + ier = FINUFFT1D3MANY(ndata, M, x, cm, 
+1, acc, 1, s, Fm, &opts); + dirft1d3(M, x, c, +1, 1, s, Fe); + err = abs(Fm[0] - Fe[0]); + if (ier || err > 10 * acc) { + printf("1d3many nk=1:\tier=%d err=%.3g\n", ier, err); return 1; } - ier = FINUFFT1D3MANY(ndata,1,x,cm,+1,acc,1,s,Fm,&opts); - dirft1d3(1,x,c,+1,1,s,Fe); - err = abs(Fm[0]-Fe[0]); - if (ier || err>10*acc) { - printf("1d3many M=nk=1:\tier=%d err=%.3g\n",ier,err); + ier = FINUFFT1D3MANY(ndata, 1, x, cm, +1, acc, 1, s, Fm, &opts); + dirft1d3(1, x, c, +1, 1, s, Fe); + err = abs(Fm[0] - Fe[0]); + if (ier || err > 10 * acc) { + printf("1d3many M=nk=1:\tier=%d err=%.3g\n", ier, err); return 1; } - ier = FINUFFT1D3MANY(ndata,M,x,cm,+1,acc,N,shuge,Fm,&opts); - if (ier==0) { // any nonzero code accepted here - printf("1d3many XK prod too big:\twrong error code %d\n",ier); + ier = FINUFFT1D3MANY(ndata, M, x, cm, +1, acc, N, shuge, Fm, &opts); + if (ier == 0) { // any nonzero code accepted here + printf("1d3many XK prod too big:\twrong error code %d\n", ier); return 1; } // 2222222222222222222222222222222222222222222222222222222222222222222222222 printf("2D dumb cases.\n"); // (uses y=x, and t=s in type 3) - ier = FINUFFT2D1(M,x,x,c,+1,0,N,N,F,&opts); + ier = FINUFFT2D1(M, x, x, c, +1, 0, N, N, F, &opts); if (ier != FINUFFT_WARN_EPS_TOO_SMALL) { - printf("2d1 tol=0:\twrong err code %d\n",ier); + printf("2d1 tol=0:\twrong err code %d\n", ier); return 1; } - ier = FINUFFT2D1(M,x,x,c,+1,acc,0,0,F,&opts); + ier = FINUFFT2D1(M, x, x, c, +1, acc, 0, 0, F, &opts); if (ier) { - printf("2d1 Ns=Nt=0:\tier=%d\n",ier); + printf("2d1 Ns=Nt=0:\tier=%d\n", ier); return ier; } - ier = FINUFFT2D1(M,x,x,c,+1,acc,0,N,F,&opts); + ier = FINUFFT2D1(M, x, x, c, +1, acc, 0, N, F, &opts); if (ier) { - printf("2d1 Ns=0,Nt>0:\tier=%d\n",ier); + printf("2d1 Ns=0,Nt>0:\tier=%d\n", ier); return ier; } - ier = FINUFFT2D1(M,x,x,c,+1,acc,N,0,F,&opts); + ier = FINUFFT2D1(M, x, x, c, +1, acc, N, 0, F, &opts); if (ier) { - printf("2d1 Ns>0,Nt=0:\tier=%d\n",ier); + printf("2d1 Ns>0,Nt=0:\tier=%d\n", ier); return ier; } - ier = FINUFFT2D1(0,x,x,c,+1,acc,N,N,F,&opts); - t = twonorm(N,F); - if (ier || t!=0.0) { - printf("2d1 M=0:\tier=%d nrm(F)=%.3g\n",ier,t); + ier = FINUFFT2D1(0, x, x, c, +1, acc, N, N, F, &opts); + t = twonorm(N, F); + if (ier || t != 0.0) { + printf("2d1 M=0:\tier=%d nrm(F)=%.3g\n", ier, t); return 1; } - for (int k=0; k0:\tier=%d nrm(c)=%.3g\n",ier,t); + ier = FINUFFT2D2(M, x, x, c, +1, acc, 0, N, F, &opts); + t = twonorm(M, c); + if (ier || t != 0.0) { + printf("2d2 Ns=0,Nt>0:\tier=%d nrm(c)=%.3g\n", ier, t); return 1; } - ier = FINUFFT2D2(M,x,x,c,+1,acc,N,0,F,&opts); - t = twonorm(M,c); - if (ier || t!=0.0) { - printf("2d2 Ns>0,Nt=0:\tier=%d nrm(c)=%.3g\n",ier,t); + ier = FINUFFT2D2(M, x, x, c, +1, acc, N, 0, F, &opts); + t = twonorm(M, c); + if (ier || t != 0.0) { + printf("2d2 Ns>0,Nt=0:\tier=%d nrm(c)=%.3g\n", ier, t); return 1; } - ier = FINUFFT2D2(0,x,x,c,+1,acc,N,N,F,&opts); + ier = FINUFFT2D2(0, x, x, c, +1, acc, N, N, F, &opts); if (ier) { - printf("2d2 M=0:\tier=%d\n",ier); + printf("2d2 M=0:\tier=%d\n", ier); return ier; } - for (int j=0; j1D since guess that 1D would catch it. 
if (ier) { - printf("2d3 M=nk=1:\tier=%d\n",ier); + printf("2d3 M=nk=1:\tier=%d\n", ier); return ier; - } - for (int k=0; k0:\tier=%d\n",ier); + printf("2d1many Ns=0,Nt>0:\tier=%d\n", ier); return ier; } - ier = FINUFFT2D1MANY(ndata,M,x,x,cm,+1,acc,N,0,Fm,&opts); + ier = FINUFFT2D1MANY(ndata, M, x, x, cm, +1, acc, N, 0, Fm, &opts); if (ier) { - printf("2d1many Ns>0,Nt=0:\tier=%d\n",ier); + printf("2d1many Ns>0,Nt=0:\tier=%d\n", ier); return ier; } - ier = FINUFFT2D1MANY(ndata,0,x,x,cm,+1,acc,N,N,Fm,&opts); - t = twonorm(N*ndata,Fm); - if (ier || t!=0.0) { - printf("2d1many M=0:\tier=%d nrm(Fm)=%.3g\n",ier,t); + ier = FINUFFT2D1MANY(ndata, 0, x, x, cm, +1, acc, N, N, Fm, &opts); + t = twonorm(N * ndata, Fm); + if (ier || t != 0.0) { + printf("2d1many M=0:\tier=%d nrm(Fm)=%.3g\n", ier, t); return 1; } - for (int k=0; k0:\tier=%d nrm(cm)=%.3g\n", ier,t); + ier = FINUFFT2D2MANY(ndata, M, x, x, cm, +1, acc, 0, N, Fm, &opts); + t = twonorm(M * ndata, cm); + if (ier || t != 0.0) { + printf("2d2many Ns=0,Nt>0:\tier=%d nrm(cm)=%.3g\n", ier, t); return 1; } - ier = FINUFFT2D2MANY(ndata,M,x,x,cm,+1,acc,N,0,Fm,&opts); - t = twonorm(M*ndata,cm); - if (ier || t!=0.0) { - printf("2d2many Ns>0,Nt=0:\tier=%d nrm(cm)=%.3g\n", ier,t); + ier = FINUFFT2D2MANY(ndata, M, x, x, cm, +1, acc, N, 0, Fm, &opts); + t = twonorm(M * ndata, cm); + if (ier || t != 0.0) { + printf("2d2many Ns>0,Nt=0:\tier=%d nrm(cm)=%.3g\n", ier, t); return 1; } - ier = FINUFFT2D2MANY(ndata,0,x,x,cm,+1,acc,N,N,Fm,&opts); + ier = FINUFFT2D2MANY(ndata, 0, x, x, cm, +1, acc, N, N, Fm, &opts); if (ier) { - printf("2d2many M=0:\tier=%d\n",ier); + printf("2d2many M=0:\tier=%d\n", ier); return ier; } - ier = FINUFFT2D3MANY(0,M,x,x,cm,+1,0,N,s,s,Fm,&opts); + ier = FINUFFT2D3MANY(0, M, x, x, cm, +1, 0, N, s, s, Fm, &opts); if (ier != FINUFFT_ERR_NTRANS_NOTVALID) { - printf("2d3many ndata=0:\twrong err code %d\n",ier); + printf("2d3many ndata=0:\twrong err code %d\n", ier); return 1; } - ier = FINUFFT2D3MANY(ndata,M,x,x,cm,+1,0,N,s,s,Fm,&opts); + ier = FINUFFT2D3MANY(ndata, M, x, x, cm, +1, 0, N, s, s, Fm, &opts); if (ier != FINUFFT_WARN_EPS_TOO_SMALL) { - printf("2d3many tol=0:\twrong err code %d\n",ier); + printf("2d3many tol=0:\twrong err code %d\n", ier); return 1; } - ier = FINUFFT2D3MANY(ndata,M,x,x,cm,+1,acc,0,s,s,Fm,&opts); + ier = FINUFFT2D3MANY(ndata, M, x, x, cm, +1, acc, 0, s, s, Fm, &opts); if (ier) { - printf("2d3many nk=0:\tier=%d\n",ier); + printf("2d3many nk=0:\tier=%d\n", ier); return ier; } - ier = FINUFFT2D3MANY(ndata,0,x,x,cm,+1,acc,N,s,s,Fm,&opts); - t = twonorm(N,Fm); - if (ier || t!=0.0) { - printf("2d3many M=0:\tier=%d nrm(F)=%.3g\n",ier,t); + ier = FINUFFT2D3MANY(ndata, 0, x, x, cm, +1, acc, N, s, s, Fm, &opts); + t = twonorm(N, Fm); + if (ier || t != 0.0) { + printf("2d3many M=0:\tier=%d nrm(F)=%.3g\n", ier, t); return 1; } - ier = FINUFFT2D3MANY(ndata,1,x,x,cm,+1,acc,N,s,s,Fm,&opts); // XK prod formally 0 + ier = FINUFFT2D3MANY(ndata, 1, x, x, cm, +1, acc, N, s, s, Fm, &opts); // XK prod + // formally 0 // we don't check the M=nk=1 case for >1D since guess that 1D would catch it. 
if (ier) { - printf("2d3many M=nk=1:\tier=%d\n",ier); + printf("2d3many M=nk=1:\tier=%d\n", ier); return ier; } - ier = FINUFFT2D3MANY(ndata,M,x,x,cm,+1,acc,N,shuge,shuge,Fm,&opts); - if (ier==0) { // any nonzero code accepted here - printf("2d3many XK prod too big:\twrong error code %d\n",ier); + ier = FINUFFT2D3MANY(ndata, M, x, x, cm, +1, acc, N, shuge, shuge, Fm, &opts); + if (ier == 0) { // any nonzero code accepted here + printf("2d3many XK prod too big:\twrong error code %d\n", ier); return 1; } - + // 3333333333333333333333333333333333333333333333333333333333333333333333333 - printf("3D dumb cases.\n"); // z=y=x, and u=t=s in type 3 - ier = FINUFFT3D1(M,x,x,x,c,+1,0,N,N,N,F,&opts); + printf("3D dumb cases.\n"); // z=y=x, and u=t=s in type 3 + ier = FINUFFT3D1(M, x, x, x, c, +1, 0, N, N, N, F, &opts); if (ier != FINUFFT_WARN_EPS_TOO_SMALL) { - printf("3d1 tol=0:\twrong err code %d\n",ier); + printf("3d1 tol=0:\twrong err code %d\n", ier); return 1; } - ier = FINUFFT3D1(M,x,x,x,c,+1,acc,0,0,0,F,&opts); + ier = FINUFFT3D1(M, x, x, x, c, +1, acc, 0, 0, 0, F, &opts); if (ier) { - printf("3d1 Ns=Nt=Nu=0:\tier=%d\n",ier); + printf("3d1 Ns=Nt=Nu=0:\tier=%d\n", ier); return ier; } - ier = FINUFFT3D1(M,x,x,x,c,+1,acc,0,N,0,F,&opts); + ier = FINUFFT3D1(M, x, x, x, c, +1, acc, 0, N, 0, F, &opts); if (ier) { - printf("3d1 Ns=0,Nt>0,Nu=0:\tier=%d\n",ier); + printf("3d1 Ns=0,Nt>0,Nu=0:\tier=%d\n", ier); return ier; } - ier = FINUFFT3D1(M,x,x,x,c,+1,acc,N,0,N,F,&opts); + ier = FINUFFT3D1(M, x, x, x, c, +1, acc, N, 0, N, F, &opts); if (ier) { - printf("3d1 Ns>0,Nt=0,Nu>0:\tier=%d\n",ier); + printf("3d1 Ns>0,Nt=0,Nu>0:\tier=%d\n", ier); return ier; } - ier = FINUFFT3D1(0,x,x,x,c,+1,acc,N,N,N,F,&opts); - t = twonorm(N,F); - if (ier || t!=0.0) { - printf("3d1 M=0:\tier=%d nrm(F)=%.3g\n",ier,t); + ier = FINUFFT3D1(0, x, x, x, c, +1, acc, N, N, N, F, &opts); + t = twonorm(N, F); + if (ier || t != 0.0) { + printf("3d1 M=0:\tier=%d nrm(F)=%.3g\n", ier, t); return 1; } - for (int k=0; k0,Nt=Nu=0:\tier=%d nrm(c)=%.3g\n",ier,t); + ier = FINUFFT3D2(M, x, x, x, c, +1, acc, N, 0, 0, F, &opts); + t = twonorm(M, c); + if (ier || t != 0.0) { + printf("3d2 Ns>0,Nt=Nu=0:\tier=%d nrm(c)=%.3g\n", ier, t); return 1; } - ier = FINUFFT3D2(M,x,x,x,c,+1,acc,0,N,0,F,&opts); - t = twonorm(M,c); - if (ier || t!=0.0) { - printf("3d2 Ns=0,Nt>0,Nu=0:\tier=%d nrm(c)=%.3g\n",ier,t); + ier = FINUFFT3D2(M, x, x, x, c, +1, acc, 0, N, 0, F, &opts); + t = twonorm(M, c); + if (ier || t != 0.0) { + printf("3d2 Ns=0,Nt>0,Nu=0:\tier=%d nrm(c)=%.3g\n", ier, t); return 1; } - ier = FINUFFT3D2(M,x,x,x,c,+1,acc,0,0,N,F,&opts); - t = twonorm(M,c); - if (ier || t!=0.0) { - printf("3d2 Ns=Nt=0,Nu>0:\tier=%d nrm(c)=%.3g\n",ier,t); + ier = FINUFFT3D2(M, x, x, x, c, +1, acc, 0, 0, N, F, &opts); + t = twonorm(M, c); + if (ier || t != 0.0) { + printf("3d2 Ns=Nt=0,Nu>0:\tier=%d nrm(c)=%.3g\n", ier, t); return 1; } - ier = FINUFFT3D2(0,x,x,x,c,+1,acc,N,N,N,F,&opts); + ier = FINUFFT3D2(0, x, x, x, c, +1, acc, N, N, N, F, &opts); if (ier) { - printf("3d2 M=0:\tier=%d\n",ier); + printf("3d2 M=0:\tier=%d\n", ier); return ier; } - for (int j=0; j1D since guess that 1D would catch it. 
if (ier) { - printf("3d3 M=nk=1:\tier=%d\n",ier); + printf("3d3 M=nk=1:\tier=%d\n", ier); return ier; } - for (int k=0; k0,Nt=Nu=0:\tier=%d\n",ier); + printf("3d1many Ns>0,Nt=Nu=0:\tier=%d\n", ier); return ier; } - ier = FINUFFT3D1MANY(ndata,M,x,x,x,cm,+1,acc,0,N,0,Fm,&opts); + ier = FINUFFT3D1MANY(ndata, M, x, x, x, cm, +1, acc, 0, N, 0, Fm, &opts); if (ier) { - printf("3d1many Ns=0,Nt>0,Nu=0:\tier=%d\n",ier); + printf("3d1many Ns=0,Nt>0,Nu=0:\tier=%d\n", ier); return ier; } - ier = FINUFFT3D1MANY(ndata,M,x,x,x,cm,+1,acc,0,0,N,Fm,&opts); + ier = FINUFFT3D1MANY(ndata, M, x, x, x, cm, +1, acc, 0, 0, N, Fm, &opts); if (ier) { - printf("3d1many Ns=Nt=0,Nu>0:\tier=%d\n",ier); + printf("3d1many Ns=Nt=0,Nu>0:\tier=%d\n", ier); return ier; } - ier = FINUFFT3D1MANY(ndata,0,x,x,x,cm,+1,acc,N,N,N,Fm,&opts); - t = twonorm(N*ndata,Fm); - if (ier || t!=0.0) { - printf("3d1many M=0:\tier=%d nrm(Fm)=%.3g\n",ier,t); + ier = FINUFFT3D1MANY(ndata, 0, x, x, x, cm, +1, acc, N, N, N, Fm, &opts); + t = twonorm(N * ndata, Fm); + if (ier || t != 0.0) { + printf("3d1many M=0:\tier=%d nrm(Fm)=%.3g\n", ier, t); return 1; } - for (int k=0; k0,Nt=Nu=0:\tier=%d nrm(cm)=%.3g\n", ier,t); + ier = FINUFFT3D2MANY(ndata, M, x, x, x, cm, +1, acc, N, 0, 0, Fm, &opts); + t = twonorm(M * ndata, cm); + if (ier || t != 0.0) { + printf("3d2many Ns>0,Nt=Nu=0:\tier=%d nrm(cm)=%.3g\n", ier, t); return 1; } - ier = FINUFFT3D2MANY(ndata,M,x,x,x,cm,+1,acc,0,N,0,Fm,&opts); - t = twonorm(M*ndata,cm); - if (ier || t!=0.0) { - printf("3d2many Ns=0,Nt>0,Nu=0:\tier=%d nrm(cm)=%.3g\n", ier,t); + ier = FINUFFT3D2MANY(ndata, M, x, x, x, cm, +1, acc, 0, N, 0, Fm, &opts); + t = twonorm(M * ndata, cm); + if (ier || t != 0.0) { + printf("3d2many Ns=0,Nt>0,Nu=0:\tier=%d nrm(cm)=%.3g\n", ier, t); return 1; } - ier = FINUFFT3D2MANY(ndata,M,x,x,x,cm,+1,acc,0,0,N,Fm,&opts); - t = twonorm(M*ndata,cm); - if (ier || t!=0.0) { - printf("3d2many Ns=Nt=0,Nu>0:\tier=%d nrm(cm)=%.3g\n", ier,t); + ier = FINUFFT3D2MANY(ndata, M, x, x, x, cm, +1, acc, 0, 0, N, Fm, &opts); + t = twonorm(M * ndata, cm); + if (ier || t != 0.0) { + printf("3d2many Ns=Nt=0,Nu>0:\tier=%d nrm(cm)=%.3g\n", ier, t); return 1; } - ier = FINUFFT3D2MANY(ndata,0,x,x,x,cm,+1,acc,N,N,N,Fm,&opts); + ier = FINUFFT3D2MANY(ndata, 0, x, x, x, cm, +1, acc, N, N, N, Fm, &opts); if (ier) { - printf("3d2many M=0:\tier=%d\n",ier); + printf("3d2many M=0:\tier=%d\n", ier); return ier; } - ier = FINUFFT3D3MANY(0,M,x,x,x,cm,+1,0,N,s,s,s,Fm,&opts); + ier = FINUFFT3D3MANY(0, M, x, x, x, cm, +1, 0, N, s, s, s, Fm, &opts); if (ier != FINUFFT_ERR_NTRANS_NOTVALID) { - printf("3d3many ndata=0:\twrong err code %d\n",ier); + printf("3d3many ndata=0:\twrong err code %d\n", ier); return 1; } - ier = FINUFFT3D3MANY(ndata,M,x,x,x,cm,+1,0,N,s,s,s,Fm,&opts); + ier = FINUFFT3D3MANY(ndata, M, x, x, x, cm, +1, 0, N, s, s, s, Fm, &opts); if (ier != FINUFFT_WARN_EPS_TOO_SMALL) { - printf("3d3many tol=0:\twrong err code %d\n",ier); + printf("3d3many tol=0:\twrong err code %d\n", ier); return 1; } - ier = FINUFFT3D3MANY(ndata,M,x,x,x,cm,+1,acc,0,s,s,s,Fm,&opts); + ier = FINUFFT3D3MANY(ndata, M, x, x, x, cm, +1, acc, 0, s, s, s, Fm, &opts); if (ier) { - printf("3d3many nk=0:\tier=%d\n",ier); + printf("3d3many nk=0:\tier=%d\n", ier); return ier; } - ier = FINUFFT3D3MANY(ndata,0,x,x,x,cm,+1,acc,N,s,s,s,Fm,&opts); - t = twonorm(N,Fm); - if (ier || t!=0.0) { - printf("3d3many M=0:\tier=%d nrm(F)=%.3g\n",ier,t); + ier = FINUFFT3D3MANY(ndata, 0, x, x, x, cm, +1, acc, N, s, s, s, Fm, &opts); + t = twonorm(N, Fm); + if (ier || t != 0.0) { + 
printf("3d3many M=0:\tier=%d nrm(F)=%.3g\n", ier, t); return 1; } - ier = FINUFFT3D3MANY(ndata,1,x,x,x,cm,+1,acc,N,s,s,s,Fm,&opts); // XK prod formally 0 + ier = FINUFFT3D3MANY(ndata, 1, x, x, x, cm, +1, acc, N, s, s, s, Fm, &opts); // XK + // prod + // formally + // 0 // we don't check the M=nk=1 case for >1D since guess that 1D would catch it. if (ier) { - printf("3d3many M=nk=1:\tier=%d\n",ier); + printf("3d3many M=nk=1:\tier=%d\n", ier); return ier; } - ier = FINUFFT3D3MANY(ndata,M,x,x,x,cm,+1,acc,N,shuge,shuge,shuge,Fm,&opts); - if (ier==0) { // any nonzero code accepted here - printf("3d3many XK prod too big:\twrong error code %d\n",ier); + ier = FINUFFT3D3MANY(ndata, M, x, x, x, cm, +1, acc, N, shuge, shuge, shuge, Fm, &opts); + if (ier == 0) { // any nonzero code accepted here + printf("3d3many XK prod too big:\twrong error code %d\n", ier); return 1; } - - free(x); free(c); free(F); free(s); free(shuge); free(cm); free(Fm); free(Fe); - + + free(x); + free(c); + free(F); + free(s); + free(shuge); + free(cm); + free(Fm); + free(Fe); + // GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG // some dumb tests for guru interface to induce free() crash in destroy... FINUFFT_PLAN plan; - BIGINT Ns[1] = {0}; // since dim=1, don't have to make length 3 - FINUFFT_MAKEPLAN(1, 1, Ns, +1, 1, acc, &plan, NULL); // type 1, now kill it + BIGINT Ns[1] = {0}; // since dim=1, don't have to make length 3 + FINUFFT_MAKEPLAN(1, 1, Ns, +1, 1, acc, &plan, NULL); // type 1, now kill it FINUFFT_DESTROY(plan); - FINUFFT_MAKEPLAN(3, 1, Ns, +1, 1, acc, &plan, NULL); // type 3, now kill it + FINUFFT_MAKEPLAN(3, 1, Ns, +1, 1, acc, &plan, NULL); // type 3, now kill it FINUFFT_DESTROY(plan); // *** todo: more extensive bad inputs and error catching in guru... 
- + #ifdef SINGLE printf("dumbinputsf passed.\n"); #else printf("dumbinputs passed.\n"); #endif - + return 0; } diff --git a/test/finufft1d_test.cpp b/test/finufft1d_test.cpp index 8dd345b1a..b4a946738 100644 --- a/test/finufft1d_test.cpp +++ b/test/finufft1d_test.cpp @@ -4,170 +4,184 @@ using namespace std; using namespace finufft::utils; -const char* help[]={ - "Tester for FINUFFT in 1d, all 3 types, either precision.", - "", - "Usage: finufft1d_test Nmodes Nsrc [tol [debug [spread_sort [upsampfac [errfail]]]]]", - "\teg:\tfinufft1d_test 1e6 1e6 1e-6 1 2 2.0 1e-5", - "\tnotes:\tif errfail present, exit code 1 if any error > errfail", - NULL}; +const char *help[] = { + "Tester for FINUFFT in 1d, all 3 types, either precision.", + "", + "Usage: finufft1d_test Nmodes Nsrc [tol [debug [spread_sort [upsampfac [errfail]]]]]", + "\teg:\tfinufft1d_test 1e6 1e6 1e-6 1 2 2.0 1e-5", + "\tnotes:\tif errfail present, exit code 1 if any error > errfail", + NULL}; // Barnett 1/22/17 onwards -int main(int argc, char* argv[]) -{ - BIGINT M, N; // M = # srcs, N = # modes out - double w, tol = 1e-6; // default +int main(int argc, char *argv[]) { + BIGINT M, N; // M = # srcs, N = # modes out + double w, tol = 1e-6; // default double err, errfail = INFINITY, errmax = 0; - finufft_opts opts; FINUFFT_DEFAULT_OPTS(&opts); // put defaults in opts + finufft_opts opts; + FINUFFT_DEFAULT_OPTS(&opts); // put defaults in opts // opts.fftw = FFTW_MEASURE; // change from usual FFTW_ESTIMATE - int isign = +1; // choose which exponential sign to test - if (argc<3 || argc>8) { - for (int i=0; help[i]; ++i) - fprintf(stderr,"%s\n",help[i]); + int isign = +1; // choose which exponential sign to test + if (argc < 3 || argc > 8) { + for (int i = 0; help[i]; ++i) fprintf(stderr, "%s\n", help[i]); return 2; } - sscanf(argv[1],"%lf",&w); N = (BIGINT)w; - sscanf(argv[2],"%lf",&w); M = (BIGINT)w; - if (argc>3) sscanf(argv[3],"%lf",&tol); - if (argc>4) sscanf(argv[4],"%d",&opts.debug); - opts.spread_debug = (opts.debug>1) ? 1 : 0; // see output from spreader - if (argc>5) sscanf(argv[5],"%d",&opts.spread_sort); - if (argc>6) { sscanf(argv[6],"%lf",&w); opts.upsampfac=(FLT)w; } - if (argc>7) sscanf(argv[7],"%lf",&errfail); - + sscanf(argv[1], "%lf", &w); + N = (BIGINT)w; + sscanf(argv[2], "%lf", &w); + M = (BIGINT)w; + if (argc > 3) sscanf(argv[3], "%lf", &tol); + if (argc > 4) sscanf(argv[4], "%d", &opts.debug); + opts.spread_debug = (opts.debug > 1) ? 
1 : 0; // see output from spreader + if (argc > 5) sscanf(argv[5], "%d", &opts.spread_sort); + if (argc > 6) { + sscanf(argv[6], "%lf", &w); + opts.upsampfac = (FLT)w; + } + if (argc > 7) sscanf(argv[7], "%lf", &errfail); + cout << scientific << setprecision(15); - FLT *x = (FLT*)malloc(sizeof(FLT)*M); // NU pts - CPX* c = (CPX*)malloc(sizeof(CPX)*M); // strengths - CPX* F = (CPX*)malloc(sizeof(CPX)*N); // mode ampls + FLT *x = (FLT *)malloc(sizeof(FLT) * M); // NU pts + CPX *c = (CPX *)malloc(sizeof(CPX) * M); // strengths + CPX *F = (CPX *)malloc(sizeof(CPX) * N); // mode ampls #pragma omp parallel { - unsigned int se=MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s -#pragma omp for schedule(static,TEST_RANDCHUNK) // static => non-stochastic - for (BIGINT j=0; j non-stochastic + for (BIGINT j = 0; j < M; ++j) { + x[j] = PI * randm11r(&se); // fills [-pi,pi) c[j] = crandm11r(&se); } } - //for (BIGINT j=0; j1) { - printf("error (ier=%d)!\n",ier); + CNTime timer; + timer.start(); + int ier = FINUFFT1D1(M, x, c, isign, tol, N, F, &opts); + // for (int j=0;j 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("\t%lld NU pts to %lld modes in %.3g s \t%.3g NU pts/s\n",(long long)M,(long long)N,t,M/t); + printf("\t%lld NU pts to %lld modes in %.3g s \t%.3g NU pts/s\n", (long long)M, + (long long)N, t, M / t); - BIGINT nt = (BIGINT)(0.37*N); // check arb choice of mode near the top (N/2) -//#pragma omp declare reduction (cmplxadd:CPX:omp_out=omp_out+omp_in) initializer(omp_priv={0.0,0.0}) // only for openmp v 4.0! - //#pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(cmplxadd:Ft) + BIGINT nt = (BIGINT)(0.37 * N); // check arb choice of mode near the top (N/2) + // #pragma omp declare reduction (cmplxadd:CPX:omp_out=omp_out+omp_in) + // initializer(omp_priv={0.0,0.0}) // only for openmp v 4.0! 
#pragma omp parallel for + // schedule(static,TEST_RANDCHUNK) reduction(cmplxadd:Ft) FLT Ftr = 0.0, Fti = 0.0; -#pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(+:Ftr,Fti) - for (BIGINT j=0; j1) { - printf("error (ier=%d)!\n",ier); + ier = FINUFFT1D2(M, x, c, isign, tol, N, F, &opts); + // cout<<"c:\n"; for (int j=0;j 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("\t%lld modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n",(long long)N,(long long)M,t,M/t); + printf("\t%lld modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", (long long)N, + (long long)M, t, M / t); - BIGINT jt = M/2; // check arbitrary choice of one targ pt - CPX ct = CPX(0,0); - BIGINT m=0, k0 = N/2; // index shift in fk's = mag of most neg freq - //#pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(cmplxadd:ct) - for (BIGINT m1=-k0; m1<=(N-1)/2; ++m1) - ct += F[m++] * exp(IMA*((FLT)(isign*m1))*x[jt]); // crude direct - err = abs(ct-c[jt])/infnorm(M,c); - errmax = max(err,errmax); - printf("\tone targ: rel err in c[%lld] is %.3g\n",(long long)jt,err); - if (((int64_t)M)*N<=TEST_BIGPROB) { // also full direct eval - CPX* ct = (CPX*)malloc(sizeof(CPX)*M); - dirft1d2(M,x,ct,isign,N,F); - err = relerrtwonorm(M,ct,c); - errmax = max(err,errmax); - printf("\tdirft1d: rel l2-err of result c is %.3g\n",err); - //cout<<"c/ct:\n"; for (int j=0;j0) { - printf("error (ier=%d)!\n",ier); + ier = FINUFFT1D3(M, x, c, isign, tol, N, s, F, &opts); + t = timer.elapsedsec(); + if (ier > 0) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("\t%lld NU to %lld NU in %.3g s \t%.3g tot NU pts/s\n",(long long)M,(long long)N,t,(M+N)/t); + printf("\t%lld NU to %lld NU in %.3g s \t%.3g tot NU pts/s\n", (long long)M, + (long long)N, t, (M + N) / t); - BIGINT kt = N/2; // check arbitrary choice of one targ pt - Ftr = 0.0; - Fti = 0.0; -#pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(+:Ftr,Fti) - for (BIGINT j=0; jerrfail); + free(x); + free(c); + free(F); + free(s); + return (errmax > errfail); } diff --git a/test/finufft1dmany_test.cpp b/test/finufft1dmany_test.cpp index 581c52c2d..e17cbb65e 100644 --- a/test/finufft1dmany_test.cpp +++ b/test/finufft1dmany_test.cpp @@ -4,164 +4,172 @@ using namespace std; using namespace finufft::utils; -const char* help[]={ - "Tester for FINUFFT in 1d, vectorized, all 3 types, either precision.", - "", - "Usage: finufft1dmany_test ntrans Nmodes Nsrc [tol [debug [spread_thread [maxbatchsize [spreadsort [upsampfac [errfail]]]]]]]", - "\teg:\tfinufft1dmany_test 100 1e3 1e4 1e-6 1 0 0 2 0.0 1e-5", - "\tnotes:\tif errfail present, exit code 1 if any error > errfail", - NULL}; +const char *help[] = { + "Tester for FINUFFT in 1d, vectorized, all 3 types, either precision.", + "", + "Usage: finufft1dmany_test ntrans Nmodes Nsrc [tol [debug [spread_thread " + "[maxbatchsize [spreadsort [upsampfac [errfail]]]]]]]", + "\teg:\tfinufft1dmany_test 100 1e3 1e4 1e-6 1 0 0 2 0.0 1e-5", + "\tnotes:\tif errfail present, exit code 1 if any error > errfail", + NULL}; // Malleo 2019 based on Shih 2018. 
Tidied, extra args, Barnett 5/25/20 onwards -int main(int argc, char* argv[]) -{ - BIGINT M, N; // M = # srcs, N = # modes - int ntransf; // # of vectors for "many" interface - double w, tol = 1e-6; // default +int main(int argc, char *argv[]) { + BIGINT M, N; // M = # srcs, N = # modes + int ntransf; // # of vectors for "many" interface + double w, tol = 1e-6; // default double err, errfail = INFINITY, errmax = 0; - finufft_opts opts; FINUFFT_DEFAULT_OPTS(&opts); + finufft_opts opts; + FINUFFT_DEFAULT_OPTS(&opts); // opts.fftw = FFTW_MEASURE; // change from usual FFTW_ESTIMATE - int isign = +1; // choose which exponential sign to test - if (argc<4 || argc>11) { - for (int i=0; help[i]; ++i) - fprintf(stderr,"%s\n",help[i]); + int isign = +1; // choose which exponential sign to test + if (argc < 4 || argc > 11) { + for (int i = 0; help[i]; ++i) fprintf(stderr, "%s\n", help[i]); return 2; } - sscanf(argv[1],"%lf",&w); ntransf = (int)w; - sscanf(argv[2],"%lf",&w); N = (BIGINT)w; - sscanf(argv[3],"%lf",&w); M = (BIGINT)w; - if (argc>4) sscanf(argv[4],"%lf",&tol); - if (argc>5) sscanf(argv[5],"%d",&opts.debug); - opts.spread_debug = (opts.debug>1) ? 1 : 0; // see output from spreader - if (argc>6) sscanf(argv[6],"%d",&opts.spread_thread); - if (argc>7) sscanf(argv[7],"%d",&opts.maxbatchsize); - if (argc>8) sscanf(argv[8],"%d",&opts.spread_sort); - if (argc>9) { sscanf(argv[9],"%lf",&w); opts.upsampfac=(FLT)w; } - if (argc>10) sscanf(argv[10],"%lf",&errfail); + sscanf(argv[1], "%lf", &w); + ntransf = (int)w; + sscanf(argv[2], "%lf", &w); + N = (BIGINT)w; + sscanf(argv[3], "%lf", &w); + M = (BIGINT)w; + if (argc > 4) sscanf(argv[4], "%lf", &tol); + if (argc > 5) sscanf(argv[5], "%d", &opts.debug); + opts.spread_debug = (opts.debug > 1) ? 1 : 0; // see output from spreader + if (argc > 6) sscanf(argv[6], "%d", &opts.spread_thread); + if (argc > 7) sscanf(argv[7], "%d", &opts.maxbatchsize); + if (argc > 8) sscanf(argv[8], "%d", &opts.spread_sort); + if (argc > 9) { + sscanf(argv[9], "%lf", &w); + opts.upsampfac = (FLT)w; + } + if (argc > 10) sscanf(argv[10], "%lf", &errfail); cout << scientific << setprecision(15); - - FLT* x = (FLT*)malloc(sizeof(FLT)*M); // NU pts x coords - CPX* c = (CPX*)malloc(sizeof(CPX)*M*ntransf); // strengths - CPX* F = (CPX*)malloc(sizeof(CPX)*N*ntransf); // mode ampls + + FLT *x = (FLT *)malloc(sizeof(FLT) * M); // NU pts x coords + CPX *c = (CPX *)malloc(sizeof(CPX) * M * ntransf); // strengths + CPX *F = (CPX *)malloc(sizeof(CPX) * N * ntransf); // mode ampls #pragma omp parallel { - unsigned int se=MY_OMP_GET_THREAD_NUM(); -#pragma omp for schedule(static,TEST_RANDCHUNK) - for (BIGINT j=0; j1) { - printf("error (ier=%d)!\n",ier); + CNTime timer; + timer.start(); + int ier = FINUFFT1D1MANY(ntransf, M, x, c, isign, tol, N, F, &opts); + double ti = timer.elapsedsec(); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("ntr=%d: %lld NU pts to %lld modes in %.3g s \t%.3g NU pts/s\n", ntransf,(long long)M,(long long)N,ti,ntransf*M/ti); - - int i = (ntransf-1); // choose a trial to check - BIGINT nt1 = (BIGINT)(0.37*N); // choose some mode index to check - CPX Ft = CPX(0,0), J = IMA*(FLT)isign; - for (BIGINT j=0; j1) { - printf("error (ier=%d)!\n",ier); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("%d of: %lld NU pts to %lld modes in %.3g s \t%.3g NU pts/s\n", ntransf,(long long)M,(long long)N,t,ntransf*M/t); - printf("\t\t\tspeedup \t T_FINUFFT1D1 / T_finufft1d1many = %.3g\n", t/ti); - - // Check consistency 
(worst over the ntransf) + printf("%d of: %lld NU pts to %lld modes in %.3g s \t%.3g NU pts/s\n", ntransf, + (long long)M, (long long)N, t, ntransf * M / t); + printf("\t\t\tspeedup \t T_FINUFFT1D1 / T_finufft1d1many = %.3g\n", t / ti); + + // Check consistency (worst over the ntransf) double maxerror = 0.0; for (int k = 0; k < ntransf; ++k) - maxerror = max(maxerror, (double)relerrtwonorm(N,F_1d1+k*N,F+k*N)); - errmax = max(maxerror,errmax); - printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2 ) = %.3g\n",maxerror); + maxerror = max(maxerror, (double)relerrtwonorm(N, F_1d1 + k * N, F + k * N)); + errmax = max(maxerror, errmax); + printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2 ) = %.3g\n", maxerror); free(F_1d1); - printf("test 1d2 many vs repeated single: ------------------------------------\n"); FFTW_FORGET_WISDOM(); #pragma omp parallel { - unsigned int se=MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s -#pragma omp for schedule(static,TEST_RANDCHUNK) - for (BIGINT m=0; m1) { - printf("error (ier=%d)!\n",ier); + ier = FINUFFT1D2MANY(ntransf, M, x, c, isign, tol, N, F, &opts); + // cout<<"c:\n"; for (int j=0;j 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("ntr=%d: %lld modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n",ntransf,(long long)N,(long long)M,ti,ntransf*M/ti); - - BIGINT jt = M/2; // check arbitrary choice of one targ pt - CPX ct = CPX(0,0); - BIGINT m=0, k0 = N/2; // index shift in fk's = mag of most neg freq - //#pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(cmplxadd:ct) - for (BIGINT m1=-k0; m1<=(N-1)/2; ++m1) - ct += F[i*N + m++] * exp(IMA*((FLT)(isign*m1))*x[jt]); // crude direct - err = abs(ct-c[jt + i*M])/infnorm(M,c+i*M); - errmax = max(err,errmax); - printf("\tone targ: rel err in c[%lld] of trans#%d is %.3g\n",(long long)jt,i,err); + printf("ntr=%d: %lld modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", ntransf, + (long long)N, (long long)M, ti, ntransf * M / ti); + + BIGINT jt = M / 2; // check arbitrary choice of one targ pt + CPX ct = CPX(0, 0); + BIGINT m = 0, k0 = N / 2; // index shift in fk's = mag of most neg freq + // #pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(cmplxadd:ct) + for (BIGINT m1 = -k0; m1 <= (N - 1) / 2; ++m1) + ct += F[i * N + m++] * exp(IMA * ((FLT)(isign * m1)) * x[jt]); // crude direct + err = abs(ct - c[jt + i * M]) / infnorm(M, c + i * M); + errmax = max(err, errmax); + printf("\tone targ: rel err in c[%lld] of trans#%d is %.3g\n", (long long)jt, i, err); // check against single calls to FINUFFT1D2... 
FFTW_FORGET_WISDOM(); - CPX * c_1d2 = (CPX *)malloc(sizeof(CPX)*M*ntransf); + CPX *c_1d2 = (CPX *)malloc(sizeof(CPX) * M * ntransf); timer.restart(); - for(BIGINT j = 0; j < ntransf; j++){ - Fstart = F + j*N; - cstart = c_1d2 + j*M; - FINUFFT1D2(M,x,cstart,isign,tol,N,Fstart,&simpleopts); + for (BIGINT j = 0; j < ntransf; j++) { + Fstart = F + j * N; + cstart = c_1d2 + j * M; + FINUFFT1D2(M, x, cstart, isign, tol, N, Fstart, &simpleopts); } t = timer.elapsedsec(); - if (ier>1) { - printf("error (ier=%d)!\n",ier); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("%d of: %lld modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", ntransf,(long long)N,(long long)M,t,ntransf*M/t); - printf("\t\t\tspeedup \t T_FINUFFT1D2 / T_finufft1d2many = %.3g\n", t/ti); - - maxerror = 0.0; // worst error over the ntransf + printf("%d of: %lld modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", ntransf, + (long long)N, (long long)M, t, ntransf * M / t); + printf("\t\t\tspeedup \t T_FINUFFT1D2 / T_finufft1d2many = %.3g\n", t / ti); + + maxerror = 0.0; // worst error over the ntransf for (int k = 0; k < ntransf; ++k) - maxerror = max(maxerror, (double)relerrtwonorm(M,c_1d2+k*M,c+k*M)); - errmax = max(maxerror,errmax); - printf("\tconsistency check: sup ( ||c_many-c||_2 / ||c||_2 ) = %.3g\n",maxerror); + maxerror = max(maxerror, (double)relerrtwonorm(M, c_1d2 + k * M, c + k * M)); + errmax = max(maxerror, errmax); + printf("\tconsistency check: sup ( ||c_many-c||_2 / ||c||_2 ) = %.3g\n", maxerror); free(c_1d2); printf("test 1d3 many vs repeated single: ------------------------------------\n"); @@ -169,68 +177,69 @@ int main(int argc, char* argv[]) #pragma omp parallel { - unsigned int se=MY_OMP_GET_THREAD_NUM(); -#pragma omp for schedule(static,TEST_RANDCHUNK) - for (BIGINT j=0; j1) { - printf("error (ier=%d)!\n",ier); + ier = FINUFFT1D3MANY(ntransf, M, x, c, isign, tol, N, s, F, &opts); + ti = timer.elapsedsec(); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("ntr=%d: %lld NU to %lld NU in %.3g s \t%.3g tot NU pts/s\n",ntransf,(long long)M,(long long)N,ti,ntransf*(M+N)/ti); - - BIGINT kt = N/4; // check arbitrary choice of one targ pt - Ft = CPX(0,0); - //#pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(cmplxadd:Ft) - for (BIGINT j=0;j1) { - printf("error (ier=%d)!\n",ier); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("%d of: %lld NU to %lld NU in %.3g s \t%.3g tot NU pts/s\n",ntransf, (long long)M,(long long)N,t,ntransf*(M+N)/t); - printf("\t\t\tspeedup \t T_FINUFFT1D3 / T_finufft1d3many = %.3g\n", t/ti); + printf("%d of: %lld NU to %lld NU in %.3g s \t%.3g tot NU pts/s\n", ntransf, + (long long)M, (long long)N, t, ntransf * (M + N) / t); + printf("\t\t\tspeedup \t T_FINUFFT1D3 / T_finufft1d3many = %.3g\n", t / ti); - maxerror = 0.0; // worst error over the ntransf + maxerror = 0.0; // worst error over the ntransf for (int k = 0; k < ntransf; ++k) - maxerror = max(maxerror, (double)relerrtwonorm(N,f_1d3+k*N,F+k*N)); - errmax = max(maxerror,errmax); - printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2 ) = %.3g\n",maxerror); + maxerror = max(maxerror, (double)relerrtwonorm(N, f_1d3 + k * N, F + k * N)); + errmax = max(maxerror, errmax); + printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2 ) = %.3g\n", maxerror); free(f_1d3); free(x); free(s); free(c); free(F); - return (errmax>errfail); -} + return (errmax > errfail); +} diff --git a/test/finufft2d_test.cpp 
b/test/finufft2d_test.cpp index 04945b5f9..5640e29da 100644 --- a/test/finufft2d_test.cpp +++ b/test/finufft2d_test.cpp @@ -4,177 +4,195 @@ using namespace std; using namespace finufft::utils; -const char* help[]={ - "Tester for FINUFFT in 2d, all 3 types, either precision.", - "", - "Usage: finufft2d_test Nmodes1 Nmodes2 Nsrc [tol [debug [spread_sort [upsampfac [errfail]]]]]", - "\teg:\tfinufft2d_test 1000 1000 1000000 1e-12 1 2 2.0 1e-11", - "\tnotes:\tif errfail present, exit code 1 if any error > errfail", - NULL}; +const char *help[] = {"Tester for FINUFFT in 2d, all 3 types, either precision.", + "", + "Usage: finufft2d_test Nmodes1 Nmodes2 Nsrc [tol [debug " + "[spread_sort [upsampfac [errfail]]]]]", + "\teg:\tfinufft2d_test 1000 1000 1000000 1e-12 1 2 2.0 1e-11", + "\tnotes:\tif errfail present, exit code 1 if any error > errfail", + NULL}; // Barnett 2/1/17 onwards -int main(int argc, char* argv[]) -{ - BIGINT M, N1, N2; // M = # srcs, N1,N2 = # modes - double w, tol = 1e-6; // default +int main(int argc, char *argv[]) { + BIGINT M, N1, N2; // M = # srcs, N1,N2 = # modes + double w, tol = 1e-6; // default double err, errfail = INFINITY, errmax = 0; - finufft_opts opts; FINUFFT_DEFAULT_OPTS(&opts); + finufft_opts opts; + FINUFFT_DEFAULT_OPTS(&opts); // opts.fftw = FFTW_MEASURE; // change from usual FFTW_ESTIMATE - int isign = +1; // choose which exponential sign to test - if (argc<4 || argc>9) { - for (int i=0; help[i]; ++i) - fprintf(stderr,"%s\n",help[i]); + int isign = +1; // choose which exponential sign to test + if (argc < 4 || argc > 9) { + for (int i = 0; help[i]; ++i) fprintf(stderr, "%s\n", help[i]); return 2; } - sscanf(argv[1],"%lf",&w); N1 = (BIGINT)w; - sscanf(argv[2],"%lf",&w); N2 = (BIGINT)w; - sscanf(argv[3],"%lf",&w); M = (BIGINT)w; - if (argc>4) sscanf(argv[4],"%lf",&tol); - if (argc>5) sscanf(argv[5],"%d",&opts.debug); - opts.spread_debug = (opts.debug>1) ? 1 : 0; // see output from spreader - if (argc>6) sscanf(argv[6],"%d",&opts.spread_sort); - if (argc>7) { sscanf(argv[7],"%lf",&w); opts.upsampfac=(FLT)w; } - if (argc>8) sscanf(argv[8],"%lf",&errfail); - + sscanf(argv[1], "%lf", &w); + N1 = (BIGINT)w; + sscanf(argv[2], "%lf", &w); + N2 = (BIGINT)w; + sscanf(argv[3], "%lf", &w); + M = (BIGINT)w; + if (argc > 4) sscanf(argv[4], "%lf", &tol); + if (argc > 5) sscanf(argv[5], "%d", &opts.debug); + opts.spread_debug = (opts.debug > 1) ? 
1 : 0; // see output from spreader + if (argc > 6) sscanf(argv[6], "%d", &opts.spread_sort); + if (argc > 7) { + sscanf(argv[7], "%lf", &w); + opts.upsampfac = (FLT)w; + } + if (argc > 8) sscanf(argv[8], "%lf", &errfail); + cout << scientific << setprecision(15); - BIGINT N = N1*N2; + BIGINT N = N1 * N2; - FLT *x = (FLT *)malloc(sizeof(FLT)*M); // NU pts x coords - FLT *y = (FLT *)malloc(sizeof(FLT)*M); // NU pts y coords - CPX* c = (CPX*)malloc(sizeof(CPX)*M); // strengths - CPX* F = (CPX*)malloc(sizeof(CPX)*N); // mode ampls + FLT *x = (FLT *)malloc(sizeof(FLT) * M); // NU pts x coords + FLT *y = (FLT *)malloc(sizeof(FLT) * M); // NU pts y coords + CPX *c = (CPX *)malloc(sizeof(CPX) * M); // strengths + CPX *F = (CPX *)malloc(sizeof(CPX) * N); // mode ampls #pragma omp parallel { - unsigned int se=MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s -#pragma omp for schedule(static,TEST_RANDCHUNK) - for (BIGINT j=0; j1) { - printf("error (ier=%d)!\n",ier); + CNTime timer; + timer.start(); + int ier = FINUFFT2D1(M, x, y, c, isign, tol, N1, N2, F, &opts); + double ti = timer.elapsedsec(); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("\t%lld NU pts to (%lld,%lld) modes in %.3g s \t%.3g NU pts/s\n", - (long long)M,(long long)N1,(long long)N2,ti,M/ti); + printf("\t%lld NU pts to (%lld,%lld) modes in %.3g s \t%.3g NU pts/s\n", (long long)M, + (long long)N1, (long long)N2, ti, M / ti); - BIGINT nt1 = (BIGINT)(0.37*N1), nt2 = (BIGINT)(0.26*N2); // choose some mode index to check - FLT Ftr=0, Fti=0; // crude direct... -#pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(+:Ftr,Fti) - for (BIGINT j=0; j1) { - printf("error (ier=%d)!\n",ier); + ier = FINUFFT2D2(M, x, y, c, isign, tol, N1, N2, F, &opts); + ti = timer.elapsedsec(); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("\t(%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n",(long long)N1,(long long)N2,(long long)M,ti,M/ti); + printf("\t(%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", + (long long)N1, (long long)N2, (long long)M, ti, M / ti); - BIGINT jt = M/2; // check arbitrary choice of one targ pt - CPX ct = CPX(0,0); - BIGINT m=0; - for (BIGINT m2=-(N2/2); m2<=(N2-1)/2; ++m2) // loop in correct order over F - for (BIGINT m1=-(N1/2); m1<=(N1-1)/2; ++m1) - ct += F[m++] * exp(IMA*(FLT)isign*(m1*x[jt] + m2*y[jt])); // crude direct - err = abs(ct-c[jt])/infnorm(M,c); - errmax = max(err,errmax); - printf("\tone targ: rel err in c[%lld] is %.3g\n",(long long)jt,err); - if ((int64_t)M*N<=TEST_BIGPROB) { // also full direct eval - CPX* ct = (CPX*)malloc(sizeof(CPX)*M); - dirft2d2(M,x,y,ct,isign,N1,N2,F); - err = relerrtwonorm(M,ct,c); - errmax = max(err,errmax); - printf("\tdirft2d: rel l2-err of result c is %.3g\n",err); - //cout<<"c,ct:\n"; for (int j=0;j1) { - printf("error (ier=%d)!\n",ier); + ier = FINUFFT2D3(M, x, y, c, isign, tol, N, s, t, F, &opts); + ti = timer.elapsedsec(); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("\t%lld NU to %lld NU in %.3g s \t%.3g tot NU pts/s\n",(long long)M,(long long)N,ti,(M+N)/ti); + printf("\t%lld NU to %lld NU in %.3g s \t%.3g tot NU pts/s\n", (long long)M, + (long long)N, ti, (M + N) / ti); - BIGINT kt = N/2; // check arbitrary choice of one targ pt - Ftr=0, Fti=0; // crude direct... 
-#pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(+:Ftr,Fti) - for (BIGINT j=0; jerrfail); + free(x); + free(y); + free(c); + free(F); + free(s); + free(t); + return (errmax > errfail); } diff --git a/test/finufft2dmany_test.cpp b/test/finufft2dmany_test.cpp index 31b65378e..fd26f919d 100644 --- a/test/finufft2dmany_test.cpp +++ b/test/finufft2dmany_test.cpp @@ -4,246 +4,263 @@ using namespace std; using namespace finufft::utils; -const char* help[]={ - "Tester for FINUFFT in 2d, vectorized, all 3 types, either precision.", - "", - "Usage: finufft2dmany_test ntrans Nmodes1 Nmodes2 Nsrc [tol [debug [spread_thread [maxbatchsize [spreadsort [upsampfac [errfail]]]]]]]", - "\teg:\tfinufft2dmany_test 100 1e2 1e2 1e5 1e-6 1 0 0 2 0.0 1e-5", - "\tnotes:\tif errfail present, exit code 1 if any error > errfail", - NULL}; +const char *help[] = { + "Tester for FINUFFT in 2d, vectorized, all 3 types, either precision.", + "", + "Usage: finufft2dmany_test ntrans Nmodes1 Nmodes2 Nsrc [tol [debug [spread_thread " + "[maxbatchsize [spreadsort [upsampfac [errfail]]]]]]]", + "\teg:\tfinufft2dmany_test 100 1e2 1e2 1e5 1e-6 1 0 0 2 0.0 1e-5", + "\tnotes:\tif errfail present, exit code 1 if any error > errfail", + NULL}; // Melody Shih Jun 2018; Barnett removed many_seq 7/27/18. Extra args 5/21/20. -int main(int argc, char* argv[]) -{ - BIGINT M, N1, N2; // M = # srcs, N1,N2 = # modes - int ntransf; // # of vectors for "many" interface - double w, tol = 1e-6; // default +int main(int argc, char *argv[]) { + BIGINT M, N1, N2; // M = # srcs, N1,N2 = # modes + int ntransf; // # of vectors for "many" interface + double w, tol = 1e-6; // default double err, errfail = INFINITY, errmax = 0; - finufft_opts opts; FINUFFT_DEFAULT_OPTS(&opts); - //opts.fftw = FFTW_MEASURE; // change from default FFTW_ESTIMATE - int isign = +1; // choose which exponential sign to test - if (argc<5 || argc>12) { - for (int i=0; help[i]; ++i) - fprintf(stderr,"%s\n",help[i]); + finufft_opts opts; + FINUFFT_DEFAULT_OPTS(&opts); + // opts.fftw = FFTW_MEASURE; // change from default FFTW_ESTIMATE + int isign = +1; // choose which exponential sign to test + if (argc < 5 || argc > 12) { + for (int i = 0; help[i]; ++i) fprintf(stderr, "%s\n", help[i]); return 2; } - sscanf(argv[1],"%lf",&w); ntransf = (int)w; - sscanf(argv[2],"%lf",&w); N1 = (BIGINT)w; - sscanf(argv[3],"%lf",&w); N2 = (BIGINT)w; - sscanf(argv[4],"%lf",&w); M = (BIGINT)w; - if (argc>5) sscanf(argv[5],"%lf",&tol); - if (argc>6) sscanf(argv[6],"%d",&opts.debug); - opts.spread_debug = (opts.debug>1) ? 1 : 0; // see output from spreader - if (argc>7) sscanf(argv[7],"%d",&opts.spread_thread); - if (argc>8) sscanf(argv[8],"%d",&opts.maxbatchsize); - if (argc>9) sscanf(argv[9],"%d",&opts.spread_sort); - if (argc>10) { sscanf(argv[10],"%lf",&w); opts.upsampfac=(FLT)w; } - if (argc>11) sscanf(argv[11],"%lf",&errfail); - + sscanf(argv[1], "%lf", &w); + ntransf = (int)w; + sscanf(argv[2], "%lf", &w); + N1 = (BIGINT)w; + sscanf(argv[3], "%lf", &w); + N2 = (BIGINT)w; + sscanf(argv[4], "%lf", &w); + M = (BIGINT)w; + if (argc > 5) sscanf(argv[5], "%lf", &tol); + if (argc > 6) sscanf(argv[6], "%d", &opts.debug); + opts.spread_debug = (opts.debug > 1) ? 
1 : 0; // see output from spreader + if (argc > 7) sscanf(argv[7], "%d", &opts.spread_thread); + if (argc > 8) sscanf(argv[8], "%d", &opts.maxbatchsize); + if (argc > 9) sscanf(argv[9], "%d", &opts.spread_sort); + if (argc > 10) { + sscanf(argv[10], "%lf", &w); + opts.upsampfac = (FLT)w; + } + if (argc > 11) sscanf(argv[11], "%lf", &errfail); + cout << scientific << setprecision(15); - BIGINT N = N1*N2; + BIGINT N = N1 * N2; - FLT* x = (FLT*)malloc(sizeof(FLT)*M); // NU pts x coords - FLT* y = (FLT*)malloc(sizeof(FLT)*M); // NU pts y coords - CPX* c = (CPX*)malloc(sizeof(CPX)*M*ntransf); // strengths - CPX* F = (CPX*)malloc(sizeof(CPX)*N*ntransf); // mode ampls + FLT *x = (FLT *)malloc(sizeof(FLT) * M); // NU pts x coords + FLT *y = (FLT *)malloc(sizeof(FLT) * M); // NU pts y coords + CPX *c = (CPX *)malloc(sizeof(CPX) * M * ntransf); // strengths + CPX *F = (CPX *)malloc(sizeof(CPX) * N * ntransf); // mode ampls #pragma omp parallel { - unsigned int se=MY_OMP_GET_THREAD_NUM(); -#pragma omp for schedule(static,TEST_RANDCHUNK) - for (BIGINT j=0; j1) { - printf("error (ier=%d)!\n",ier); + CNTime timer; + timer.start(); + int ier = FINUFFT2D1MANY(ntransf, M, x, y, c, isign, tol, N1, N2, F, &opts); + double ti = timer.elapsedsec(); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("ntr=%d: %lld NU pts to (%lld,%lld) modes in %.3g s \t%.3g NU pts/s\n", ntransf,(long long)M,(long long)N1,(long long)N2,ti,ntransf*M/ti); - - int i = ntransf-1; // choose a vector (transform number) to check - BIGINT nt1 = (BIGINT)(0.37*N1), nt2 = (BIGINT)(0.26*N2); // choose some mode index to check - CPX Ft = CPX(0,0), J = IMA*(FLT)isign; - for (BIGINT j=0; j1) { - printf("error (ier=%d)!\n",ier); + double t = timer.elapsedsec(); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("%d of: %lld NU pts to (%lld,%lld) modes in %.3g s \t%.3g NU pts/s\n", ntransf,(long long)M,(long long)N1,(long long)N2,t,ntransf*M/t); - printf("\t\t\tspeedup \t T_FINUFFT2D1 / T_finufft2d1many = %.3g\n", t/ti); + printf("%d of: %lld NU pts to (%lld,%lld) modes in %.3g s \t%.3g NU pts/s\n", ntransf, + (long long)M, (long long)N1, (long long)N2, t, ntransf * M / t); + printf("\t\t\tspeedup \t T_FINUFFT2D1 / T_finufft2d1many = %.3g\n", t / ti); // Check consistency (worst over the ntransf) double maxerror = 0.0; for (int k = 0; k < ntransf; ++k) - maxerror = max(maxerror, (double)relerrtwonorm(N,F_2d1+k*N,F+k*N)); - errmax = max(maxerror,errmax); - printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2 ) = %.3g\n",maxerror); + maxerror = max(maxerror, (double)relerrtwonorm(N, F_2d1 + k * N, F + k * N)); + errmax = max(maxerror, errmax); + printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2 ) = %.3g\n", maxerror); free(F_2d1); printf("test 2d2 many vs repeated single: ------------------------------------\n"); - + #pragma omp parallel { - unsigned int se=MY_OMP_GET_THREAD_NUM(); -#pragma omp for schedule(static,TEST_RANDCHUNK) - for (BIGINT m=0; m1) { - printf("error (ier=%d)!\n",ier); + ier = FINUFFT2D2MANY(ntransf, M, x, y, c, isign, tol, N1, N2, F, &opts); + ti = timer.elapsedsec(); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("ntr=%d: (%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", ntransf,(long long)N1,(long long)N2,(long long)M,ti,ntransf*M/ti); + printf("ntr=%d: (%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", + ntransf, (long long)N1, (long long)N2, (long long)M, ti, ntransf * M / ti); 
FFTW_FORGET_WISDOM(); - i = ntransf-1; // choose a data to check - BIGINT jt = M/2; // check arbitrary choice of one targ pt - CPX ct = CPX(0,0); - BIGINT m=0; - for (BIGINT m2=-(N2/2); m2<=(N2-1)/2; ++m2) // loop in correct order over F - for (BIGINT m1=-(N1/2); m1<=(N1-1)/2; ++m1) - ct += F[i*N + m++] * exp(J*(m1*x[jt] + m2*y[jt])); // crude direct - err = abs(ct-c[jt+i*M])/infnorm(M,c+i*M); - errmax = max(err,errmax); - printf("\tone targ: rel err in c[%lld] of trans#%d is %.3g\n",(long long)jt,i,err); - + i = ntransf - 1; // choose a data to check + BIGINT jt = M / 2; // check arbitrary choice of one targ pt + CPX ct = CPX(0, 0); + BIGINT m = 0; + for (BIGINT m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2) // loop in correct order over F + for (BIGINT m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1) + ct += F[i * N + m++] * exp(J * (m1 * x[jt] + m2 * y[jt])); // crude direct + err = abs(ct - c[jt + i * M]) / infnorm(M, c + i * M); + errmax = max(err, errmax); + printf("\tone targ: rel err in c[%lld] of trans#%d is %.3g\n", (long long)jt, i, err); + // compare the result with single calls to FINUFFT2D2... - CPX* c_2d2 = (CPX*)malloc(sizeof(CPX)*M*ntransf); + CPX *c_2d2 = (CPX *)malloc(sizeof(CPX) * M * ntransf); timer.restart(); - for (int k=0; k1) { - printf("error (ier=%d)!\n",ier); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("%d of: (%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", ntransf,(long long)N1,(long long)N2,(long long)M,t,ntransf*M/t); - printf("\t\t\tspeedup \t T_FINUFFT2D2 / T_finufft2d2many = %.3g\n", t/ti); + printf("%d of: (%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", ntransf, + (long long)N1, (long long)N2, (long long)M, t, ntransf * M / t); + printf("\t\t\tspeedup \t T_FINUFFT2D2 / T_finufft2d2many = %.3g\n", t / ti); - maxerror = 0.0; // worst error over the ntransf + maxerror = 0.0; // worst error over the ntransf for (int k = 0; k < ntransf; ++k) - maxerror = max(maxerror, (double)relerrtwonorm(M,c_2d2+k*M,c+k*M)); - errmax = max(maxerror,errmax); - printf("\tconsistency check: sup ( ||c_many-c||_2 / ||c||_2 ) = %.3g\n",maxerror); + maxerror = max(maxerror, (double)relerrtwonorm(M, c_2d2 + k * M, c + k * M)); + errmax = max(maxerror, errmax); + printf("\tconsistency check: sup ( ||c_many-c||_2 / ||c||_2 ) = %.3g\n", maxerror); free(c_2d2); printf("test 2d3 many vs repeated single: ------------------------------------\n"); FFTW_FORGET_WISDOM(); - + // reuse the strengths c, interpret N as number of targs: #pragma omp parallel { - unsigned int se=MY_OMP_GET_THREAD_NUM(); -#pragma omp for schedule(static,TEST_RANDCHUNK) - for (BIGINT j=0; j1) { - printf("error (ier=%d)!\n",ier); + ier = FINUFFT2D3MANY(ntransf, M, x, y, c, isign, tol, N, s_freq, t_freq, F, &opts); + ti = timer.elapsedsec(); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("ntr=%d: %lld NU to %lld NU in %.3g s \t%.3g tot NU pts/s\n",ntransf, (long long)M,(long long)N,ti,ntransf*(M+N)/ti); - - i = ntransf-1; // choose a transform to check - BIGINT kt = N/4; // check arbitrary choice of one targ pt - Ft = CPX(0,0); - for (BIGINT j=0;j1) { - printf("error (ier=%d)!\n",ier); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("%d of: %lld NU to %lld NU in %.3g s \t%.3g tot NU pts/s\n",ntransf, (long long)M,(long long)N,t,ntransf*(M+N)/t); - printf("\t\t\tspeedup \t T_FINUFFT2D3 / T_finufft2d3many = %.3g\n", t/ti); + printf("%d of: %lld NU to %lld NU in %.3g s \t%.3g tot NU pts/s\n", ntransf, + 
(long long)M, (long long)N, t, ntransf * (M + N) / t); + printf("\t\t\tspeedup \t T_FINUFFT2D3 / T_finufft2d3many = %.3g\n", t / ti); - //check against the old - maxerror = 0.0; // worst error over the ntransf + // check against the old + maxerror = 0.0; // worst error over the ntransf for (int k = 0; k < ntransf; ++k) - maxerror = max(maxerror, (double)relerrtwonorm(N,f_2d3+k*N,F+k*N)); - errmax = max(maxerror,errmax); - printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2 ) = %.3g\n",maxerror); + maxerror = max(maxerror, (double)relerrtwonorm(N, f_2d3 + k * N, F + k * N)); + errmax = max(maxerror, errmax); + printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2 ) = %.3g\n", maxerror); free(f_2d3); - - free(x); free(y); free(c); free(F); free(s_freq); free(t_freq); - return (errmax>errfail); + + free(x); + free(y); + free(c); + free(F); + free(s_freq); + free(t_freq); + return (errmax > errfail); } diff --git a/test/finufft3d_test.cpp b/test/finufft3d_test.cpp index 29dba95d0..cdb748fb4 100644 --- a/test/finufft3d_test.cpp +++ b/test/finufft3d_test.cpp @@ -4,188 +4,208 @@ using namespace std; using namespace finufft::utils; -const char* help[]={ - "Tester for FINUFFT in 3d, all 3 types, either precision.", - "", - "Usage: finufft3d_test Nmodes1 Nmodes2 Nmodes3 Nsrc [tol [debug [spread_sort [upsampfac [errfail]]]]]", - "\teg:\tfinufft3d_test 100 200 50 1e6 1e-12 0 2 0.0 1e-11", - "\tnotes:\tif errfail present, exit code 1 if any error > errfail", - NULL}; +const char *help[] = {"Tester for FINUFFT in 3d, all 3 types, either precision.", + "", + "Usage: finufft3d_test Nmodes1 Nmodes2 Nmodes3 Nsrc [tol [debug " + "[spread_sort [upsampfac [errfail]]]]]", + "\teg:\tfinufft3d_test 100 200 50 1e6 1e-12 0 2 0.0 1e-11", + "\tnotes:\tif errfail present, exit code 1 if any error > errfail", + NULL}; // Barnett 2/2/17 onwards. -int main(int argc, char* argv[]) -{ +int main(int argc, char *argv[]) { BIGINT M, N1, N2, N3; // M = # srcs, N1,N2,N3 = # modes - double w, tol = 1e-6; // default + double w, tol = 1e-6; // default double err, errfail = INFINITY, errmax = 0; - finufft_opts opts; FINUFFT_DEFAULT_OPTS(&opts); - //opts.fftw = FFTW_MEASURE; // change from usual FFTW_ESTIMATE - //opts.spread_max_sp_size = 3e4; // override test - //opts.spread_nthr_atomic = 15; // " - int isign = +1; // choose which exponential sign to test - if (argc<5 || argc>10) { - for (int i=0; help[i]; ++i) - fprintf(stderr,"%s\n",help[i]); + finufft_opts opts; + FINUFFT_DEFAULT_OPTS(&opts); + // opts.fftw = FFTW_MEASURE; // change from usual FFTW_ESTIMATE + // opts.spread_max_sp_size = 3e4; // override test + // opts.spread_nthr_atomic = 15; // " + int isign = +1; // choose which exponential sign to test + if (argc < 5 || argc > 10) { + for (int i = 0; help[i]; ++i) fprintf(stderr, "%s\n", help[i]); return 2; } - sscanf(argv[1],"%lf",&w); N1 = (BIGINT)w; - sscanf(argv[2],"%lf",&w); N2 = (BIGINT)w; - sscanf(argv[3],"%lf",&w); N3 = (BIGINT)w; - sscanf(argv[4],"%lf",&w); M = (BIGINT)w; - if (argc>5) sscanf(argv[5],"%lf",&tol); - if (argc>6) sscanf(argv[6],"%d",&opts.debug); // can be 0,1 or 2 - opts.spread_debug = (opts.debug>1) ? 
1 : 0; // see output from spreader - if (argc>7) sscanf(argv[7],"%d",&opts.spread_sort); - if (argc>8) { sscanf(argv[8],"%lf",&w); opts.upsampfac=(FLT)w; } - if (argc>9) sscanf(argv[9],"%lf",&errfail); - + sscanf(argv[1], "%lf", &w); + N1 = (BIGINT)w; + sscanf(argv[2], "%lf", &w); + N2 = (BIGINT)w; + sscanf(argv[3], "%lf", &w); + N3 = (BIGINT)w; + sscanf(argv[4], "%lf", &w); + M = (BIGINT)w; + if (argc > 5) sscanf(argv[5], "%lf", &tol); + if (argc > 6) sscanf(argv[6], "%d", &opts.debug); // can be 0,1 or 2 + opts.spread_debug = (opts.debug > 1) ? 1 : 0; // see output from spreader + if (argc > 7) sscanf(argv[7], "%d", &opts.spread_sort); + if (argc > 8) { + sscanf(argv[8], "%lf", &w); + opts.upsampfac = (FLT)w; + } + if (argc > 9) sscanf(argv[9], "%lf", &errfail); + cout << scientific << setprecision(15); - BIGINT N = N1*N2*N3; + BIGINT N = N1 * N2 * N3; - FLT *x = (FLT *)malloc(sizeof(FLT)*M); // NU pts x coords - FLT *y = (FLT *)malloc(sizeof(FLT)*M); // NU pts y coords - FLT *z = (FLT *)malloc(sizeof(FLT)*M); // NU pts z coords - CPX* c = (CPX*)malloc(sizeof(CPX)*M); // strengths - CPX* F = (CPX*)malloc(sizeof(CPX)*N); // mode ampls + FLT *x = (FLT *)malloc(sizeof(FLT) * M); // NU pts x coords + FLT *y = (FLT *)malloc(sizeof(FLT) * M); // NU pts y coords + FLT *z = (FLT *)malloc(sizeof(FLT) * M); // NU pts z coords + CPX *c = (CPX *)malloc(sizeof(CPX) * M); // strengths + CPX *F = (CPX *)malloc(sizeof(CPX) * N); // mode ampls #pragma omp parallel { - unsigned int se=MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s -#pragma omp for schedule(static,TEST_RANDCHUNK) - for (BIGINT j=0; j1) { - printf("error (ier=%d)!\n",ier); + CNTime timer; + timer.start(); + int ier = FINUFFT3D1(M, x, y, z, c, isign, tol, N1, N2, N3, F, &opts); + double ti = timer.elapsedsec(); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else printf(" %lld NU pts to (%lld,%lld,%lld) modes in %.3g s \t%.3g NU pts/s\n", - (long long)M,(long long)N1,(long long)N2,(long long)N3,ti,M/ti); + (long long)M, (long long)N1, (long long)N2, (long long)N3, ti, M / ti); - BIGINT nt1 = (BIGINT)(0.37*N1), nt2 = (BIGINT)(0.26*N2), nt3 = (BIGINT)(-0.39*N3); // choose mode to check - FLT Ftr=0, Fti=0; // crude direct... 
-#pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(+:Ftr,Fti) - for (BIGINT j=0; j1) { - printf("error (ier=%d)!\n",ier); + ier = FINUFFT3D2(M, x, y, z, c, isign, tol, N1, N2, N3, F, &opts); + ti = timer.elapsedsec(); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else printf(" (%lld,%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", - (long long)N1,(long long)N2,(long long)N3,(long long)M,ti,M/ti); + (long long)N1, (long long)N2, (long long)N3, (long long)M, ti, M / ti); - BIGINT jt = M/2; // check arbitrary choice of one targ pt - CPX ct = CPX(0,0); - BIGINT m=0; - for (BIGINT m3=-(N3/2); m3<=(N3-1)/2; ++m3) // loop in F order - for (BIGINT m2=-(N2/2); m2<=(N2-1)/2; ++m2) - for (BIGINT m1=-(N1/2); m1<=(N1-1)/2; ++m1) - ct += F[m++] * exp(IMA*(FLT)isign*(m1*x[jt] + m2*y[jt] + m3*z[jt])); - err = abs(ct-c[jt])/infnorm(M,c); - errmax = max(err,errmax); - printf("\tone targ: rel err in c[%lld] is %.3g\n",(long long)jt,err); - if ((int64_t)M*N<=TEST_BIGPROB) { // also full direct eval - CPX* ct = (CPX*)malloc(sizeof(CPX)*M); - dirft3d2(M,x,y,z,ct,isign,N1,N2,N3,F); - err = relerrtwonorm(M,ct,c); - errmax = max(err,errmax); - printf("\tdirft3d: rel l2-err of result c is %.3g\n",err); + BIGINT jt = M / 2; // check arbitrary choice of one targ pt + CPX ct = CPX(0, 0); + BIGINT m = 0; + for (BIGINT m3 = -(N3 / 2); m3 <= (N3 - 1) / 2; ++m3) // loop in F order + for (BIGINT m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2) + for (BIGINT m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1) + ct += F[m++] * exp(IMA * (FLT)isign * (m1 * x[jt] + m2 * y[jt] + m3 * z[jt])); + err = abs(ct - c[jt]) / infnorm(M, c); + errmax = max(err, errmax); + printf("\tone targ: rel err in c[%lld] is %.3g\n", (long long)jt, err); + if ((int64_t)M * N <= TEST_BIGPROB) { // also full direct eval + CPX *ct = (CPX *)malloc(sizeof(CPX) * M); + dirft3d2(M, x, y, z, ct, isign, N1, N2, N3, F); + err = relerrtwonorm(M, ct, c); + errmax = max(err, errmax); + printf("\tdirft3d: rel l2-err of result c is %.3g\n", err); free(ct); } printf("test 3d type 3:\n"); // -------------- type 3 - // reuse the strengths c, interpret N as number of targs: + // reuse the strengths c, interpret N as number of targs: #pragma omp parallel { - unsigned int se=MY_OMP_GET_THREAD_NUM(); -#pragma omp for schedule(static,TEST_RANDCHUNK) - for (BIGINT j=0; j1) { - printf("error (ier=%d)!\n",ier); + ier = FINUFFT3D3(M, x, y, z, c, isign, tol, N, s, t, u, F, &opts); + ti = timer.elapsedsec(); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("\t%lld NU to %lld NU in %.3g s \t%.3g tot NU pts/s\n",(long long)M,(long long)N,ti,(M+N)/ti); + printf("\t%lld NU to %lld NU in %.3g s \t%.3g tot NU pts/s\n", (long long)M, + (long long)N, ti, (M + N) / ti); - BIGINT kt = N/2; // check arbitrary choice of one targ pt - Ftr=0, Fti=0; // crude direct... 
-#pragma omp parallel for schedule(static,TEST_RANDCHUNK) reduction(+:Ftr,Fti) - for (BIGINT j=0; jerrfail); + free(x); + free(y); + free(z); + free(c); + free(F); + free(s); + free(t); + free(u); + return (errmax > errfail); } diff --git a/test/finufft3dmany_test.cpp b/test/finufft3dmany_test.cpp index d427555c3..485f90b06 100644 --- a/test/finufft3dmany_test.cpp +++ b/test/finufft3dmany_test.cpp @@ -4,254 +4,285 @@ using namespace std; using namespace finufft::utils; -const char* help[]={ - "Tester for FINUFFT in 3d, vectorized, all 3 types, either precision.", - "", - "Usage: finufft3dmany_test ntrans Nmodes1 Nmodes2 Nmodes3 Nsrc [tol [debug [spread_thread [maxbatchsize [spreadsort [upsampfac [errfail]]]]]]]", - "\teg:\tfinufft3dmany_test 100 50 50 50 1e5 1e-3 1 0 0 2 0.0 1e-2", - "\tnotes:\tif errfail present, exit code 1 if any error > errfail", - NULL}; +const char *help[] = { + "Tester for FINUFFT in 3d, vectorized, all 3 types, either precision.", + "", + "Usage: finufft3dmany_test ntrans Nmodes1 Nmodes2 Nmodes3 Nsrc [tol [debug " + "[spread_thread [maxbatchsize [spreadsort [upsampfac [errfail]]]]]]]", + "\teg:\tfinufft3dmany_test 100 50 50 50 1e5 1e-3 1 0 0 2 0.0 1e-2", + "\tnotes:\tif errfail present, exit code 1 if any error > errfail", + NULL}; // Malleo 2019 based on Shih 2018. Tidied, extra args, Barnett 5/25/20. -int main(int argc, char* argv[]) -{ - BIGINT M, N1, N2, N3; // M = # srcs, N1,N2 = # modes - int ntransf; // # of vectors for "many" interface - double w, tol = 1e-6; // default +int main(int argc, char *argv[]) { + BIGINT M, N1, N2, N3; // M = # srcs, N1,N2 = # modes + int ntransf; // # of vectors for "many" interface + double w, tol = 1e-6; // default double err, errfail = INFINITY, errmax = 0; - finufft_opts opts; FINUFFT_DEFAULT_OPTS(&opts); + finufft_opts opts; + FINUFFT_DEFAULT_OPTS(&opts); // opts.fftw = FFTW_MEASURE; // change from usual FFTW_ESTIMATE - int isign = +1; // choose which exponential sign to test - if (argc<6 || argc>13) { - for (int i=0; help[i]; ++i) - fprintf(stderr,"%s\n",help[i]); + int isign = +1; // choose which exponential sign to test + if (argc < 6 || argc > 13) { + for (int i = 0; help[i]; ++i) fprintf(stderr, "%s\n", help[i]); return 2; } - sscanf(argv[1],"%lf",&w); ntransf = (int)w; - sscanf(argv[2],"%lf",&w); N1 = (BIGINT)w; - sscanf(argv[3],"%lf",&w); N2 = (BIGINT)w; - sscanf(argv[4],"%lf",&w); N3 = (BIGINT)w; - sscanf(argv[5],"%lf",&w); M = (BIGINT)w; - if (argc>6) sscanf(argv[6],"%lf",&tol); - if (argc>7) sscanf(argv[7],"%d",&opts.debug); - opts.spread_debug = (opts.debug>1) ? 1 : 0; // see output from spreader - if (argc>8) sscanf(argv[8],"%d",&opts.spread_thread); - if (argc>9) sscanf(argv[9],"%d",&opts.maxbatchsize); - if (argc>10) sscanf(argv[10],"%d",&opts.spread_sort); - if (argc>11) { sscanf(argv[11],"%lf",&w); opts.upsampfac = (FLT)w; } - if (argc>12) sscanf(argv[12],"%lf",&errfail); + sscanf(argv[1], "%lf", &w); + ntransf = (int)w; + sscanf(argv[2], "%lf", &w); + N1 = (BIGINT)w; + sscanf(argv[3], "%lf", &w); + N2 = (BIGINT)w; + sscanf(argv[4], "%lf", &w); + N3 = (BIGINT)w; + sscanf(argv[5], "%lf", &w); + M = (BIGINT)w; + if (argc > 6) sscanf(argv[6], "%lf", &tol); + if (argc > 7) sscanf(argv[7], "%d", &opts.debug); + opts.spread_debug = (opts.debug > 1) ? 
1 : 0; // see output from spreader + if (argc > 8) sscanf(argv[8], "%d", &opts.spread_thread); + if (argc > 9) sscanf(argv[9], "%d", &opts.maxbatchsize); + if (argc > 10) sscanf(argv[10], "%d", &opts.spread_sort); + if (argc > 11) { + sscanf(argv[11], "%lf", &w); + opts.upsampfac = (FLT)w; + } + if (argc > 12) sscanf(argv[12], "%lf", &errfail); cout << scientific << setprecision(15); - BIGINT N = N1*N2*N3; + BIGINT N = N1 * N2 * N3; - FLT* x = (FLT*)malloc(sizeof(FLT)*M); // NU pts x coords - FLT* y = (FLT*)malloc(sizeof(FLT)*M); // NU pts y coords - FLT* z = (FLT*)malloc(sizeof(FLT)*M); // NU pts z coords - CPX* c = (CPX*)malloc(sizeof(CPX)*M*ntransf); // strengths - CPX* F = (CPX*)malloc(sizeof(CPX)*N*ntransf); // mode ampls + FLT *x = (FLT *)malloc(sizeof(FLT) * M); // NU pts x coords + FLT *y = (FLT *)malloc(sizeof(FLT) * M); // NU pts y coords + FLT *z = (FLT *)malloc(sizeof(FLT) * M); // NU pts z coords + CPX *c = (CPX *)malloc(sizeof(CPX) * M * ntransf); // strengths + CPX *F = (CPX *)malloc(sizeof(CPX) * N * ntransf); // mode ampls #pragma omp parallel { - unsigned int se=MY_OMP_GET_THREAD_NUM(); -#pragma omp for schedule(static,TEST_RANDCHUNK) - for (BIGINT j=0; j1) { - printf("error (ier=%d)!\n",ier); + CNTime timer; + timer.start(); + int ier = FINUFFT3D1MANY(ntransf, M, x, y, z, c, isign, tol, N1, N2, N3, F, &opts); + double ti = timer.elapsedsec(); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("ntr=%d: %lld NU pts to (%lld,%lld,%lld) modes in %.3g s \t%.3g NU pts/s\n", ntransf,(long long)M,(long long)N1,(long long)N2, (long long)N3, ti,ntransf*M/ti); + printf("ntr=%d: %lld NU pts to (%lld,%lld,%lld) modes in %.3g s \t%.3g NU pts/s\n", + ntransf, (long long)M, (long long)N1, (long long)N2, (long long)N3, ti, + ntransf * M / ti); - int i = ntransf-1; // choose a data to check - BIGINT nt1 = (BIGINT)(0.37*N1), nt2 = (BIGINT)(0.26*N2), nt3 = (BIGINT)(-0.39*N3); // choose some mode index to check - CPX Ft = CPX(0,0), J = IMA*(FLT)isign; - for (BIGINT j=0; j1) { - printf("error (ier=%d)!\n",ier); + double t = timer.elapsedsec(); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("%d of: %lld NU pts to (%lld,%lld,%lld) modes in %.3g s \t%.3g NU pts/s\n", ntransf,(long long)M,(long long)N1,(long long)N2,(long long)N3,t,ntransf*M/t); - printf("\t\t\tspeedup \t T_FINUFFT3D1 / T_finufft3d1many = %.3g\n", t/ti); + printf("%d of: %lld NU pts to (%lld,%lld,%lld) modes in %.3g s \t%.3g NU pts/s\n", + ntransf, (long long)M, (long long)N1, (long long)N2, (long long)N3, t, + ntransf * M / t); + printf("\t\t\tspeedup \t T_FINUFFT3D1 / T_finufft3d1many = %.3g\n", t / ti); // Check accuracy (worst over the ntransf) double maxerror = 0.0; for (int k = 0; k < ntransf; ++k) - maxerror = max(maxerror, (double)relerrtwonorm(N,F_3d1+k*N,F+k*N)); - errmax = max(maxerror,errmax); - printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2 ) = %.3g\n",maxerror); + maxerror = max(maxerror, (double)relerrtwonorm(N, F_3d1 + k * N, F + k * N)); + errmax = max(maxerror, errmax); + printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2 ) = %.3g\n", maxerror); free(F_3d1); - printf("test 3d2 many vs repeated single: ------------------------------------\n"); #pragma omp parallel { - unsigned int se=MY_OMP_GET_THREAD_NUM(); -#pragma omp for schedule(static,TEST_RANDCHUNK) - for (BIGINT m=0; m1) { - printf("error (ier=%d)!\n",ier); + ier = FINUFFT3D2MANY(ntransf, M, x, y, z, c, isign, tol, N1, N2, N3, F, &opts); + ti = timer.elapsedsec(); + if 
(ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("ntr=%d: (%lld,%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", ntransf,(long long)N1,(long long)N2, (long long)N3, (long long)M,ti,ntransf*M/ti); - - i = ntransf-1; // choose a data to check - BIGINT jt = M/2; // check arbitrary choice of one targ pt - CPX ct = CPX(0,0); - BIGINT m=0; - for(BIGINT m3=-(N3/2); m3<=(N3-1)/2; ++m3){ - for (BIGINT m2=-(N2/2); m2<=(N2-1)/2; ++m2){ // loop in correct order over F - for (BIGINT m1=-(N1/2); m1<=(N1-1)/2; ++m1){ - ct += F[i*N + m++] * exp(J*(m1*x[jt]+m2*y[jt]+m3*z[jt])); // crude direct + printf("ntr=%d: (%lld,%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", + ntransf, (long long)N1, (long long)N2, (long long)N3, (long long)M, ti, + ntransf * M / ti); + + i = ntransf - 1; // choose a data to check + BIGINT jt = M / 2; // check arbitrary choice of one targ pt + CPX ct = CPX(0, 0); + BIGINT m = 0; + for (BIGINT m3 = -(N3 / 2); m3 <= (N3 - 1) / 2; ++m3) { + for (BIGINT m2 = -(N2 / 2); m2 <= (N2 - 1) / 2; ++m2) { // loop in correct order + // over F + for (BIGINT m1 = -(N1 / 2); m1 <= (N1 - 1) / 2; ++m1) { + ct += F[i * N + m++] * exp(J * (m1 * x[jt] + m2 * y[jt] + m3 * z[jt])); // crude + // direct } } } - err = abs(ct-c[jt+i*M])/infnorm(M,c+i*M); - errmax = max(err,errmax); - printf("\tone targ: rel err in c[%lld] of trans#%d is %.3g\n",(long long)jt,i,err); + err = abs(ct - c[jt + i * M]) / infnorm(M, c + i * M); + errmax = max(err, errmax); + printf("\tone targ: rel err in c[%lld] of trans#%d is %.3g\n", (long long)jt, i, err); FFTW_FORGET_WISDOM(); // compare the result with FINUFFT3D2... - CPX* c_3d2 = (CPX*)malloc(sizeof(CPX)*M*ntransf); + CPX *c_3d2 = (CPX *)malloc(sizeof(CPX) * M * ntransf); timer.restart(); - for (int k=0; k1) { - printf("error (ier=%d)!\n",ier); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("%d of: (%lld,%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", ntransf,(long long)N1,(long long)N2,(long long)N3,(long long)M,t,ntransf*M/t); - printf("\t\t\tspeedup \t T_FINUFFT3D2 / T_finufft3d2many = %.3g\n", t/ti); + printf("%d of: (%lld,%lld,%lld) modes to %lld NU pts in %.3g s \t%.3g NU pts/s\n", + ntransf, (long long)N1, (long long)N2, (long long)N3, (long long)M, t, + ntransf * M / t); + printf("\t\t\tspeedup \t T_FINUFFT3D2 / T_finufft3d2many = %.3g\n", t / ti); - maxerror = 0.0; // worst error over the ntransf + maxerror = 0.0; // worst error over the ntransf for (int k = 0; k < ntransf; ++k) - maxerror = max(maxerror, (double)relerrtwonorm(M,c_3d2+k*M,c+k*M)); - errmax = max(maxerror,errmax); - printf("\tconsistency check: sup ( ||c_many-c||_2 / ||c||_2 ) = %.3g\n",maxerror); + maxerror = max(maxerror, (double)relerrtwonorm(M, c_3d2 + k * M, c + k * M)); + errmax = max(maxerror, errmax); + printf("\tconsistency check: sup ( ||c_many-c||_2 / ||c||_2 ) = %.3g\n", maxerror); free(c_3d2); - printf("test 3d3 many vs repeated single: ------------------------------------\n"); FFTW_FORGET_WISDOM(); // reuse the strengths c, interpret N as number of targs: #pragma omp parallel { - unsigned int se=MY_OMP_GET_THREAD_NUM(); -#pragma omp for schedule(static,TEST_RANDCHUNK) - for (BIGINT j=0; j1) { - printf("error (ier=%d)!\n",ier); + ier = FINUFFT3D3MANY(ntransf, M, x, y, z, c, isign, tol, N, s_freq, t_freq, u_freq, F, + &opts); + ti = timer.elapsedsec(); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("ntr=%d: %lld NU to %lld NU in %.3g s \t%.3g tot NU 
pts/s\n",ntransf, (long long)M,(long long)N,ti,ntransf*(M+N)/ti); + printf("ntr=%d: %lld NU to %lld NU in %.3g s \t%.3g tot NU pts/s\n", ntransf, + (long long)M, (long long)N, ti, ntransf * (M + N) / ti); - i = ntransf-1; // choose a transform to check - BIGINT kt = N/4; // check arbitrary choice of one targ pt - Ft = CPX(0,0); - for (BIGINT j=0;j1) { - printf("error (ier=%d)!\n",ier); + if (ier > 1) { + printf("error (ier=%d)!\n", ier); return ier; } else - printf("%d of: %lld NU to %lld NU in %.3g s \t%.3g tot NU pts/s\n",ntransf, (long long)M,(long long)N,t,ntransf*(M+N)/t); - printf("\t\t\tspeedup \t T_FINUFFT3D3 / T_finufft3d3many = %.3g\n", t/ti); - - maxerror = 0.0; // worst error over the ntransf + printf("%d of: %lld NU to %lld NU in %.3g s \t%.3g tot NU pts/s\n", ntransf, + (long long)M, (long long)N, t, ntransf * (M + N) / t); + printf("\t\t\tspeedup \t T_FINUFFT3D3 / T_finufft3d3many = %.3g\n", t / ti); + + maxerror = 0.0; // worst error over the ntransf for (int k = 0; k < ntransf; ++k) - maxerror = max(maxerror, (double)relerrtwonorm(N,f_3d3+k*N,F+k*N)); - errmax = max(maxerror,errmax); - printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2 ) = %.3g\n",maxerror); + maxerror = max(maxerror, (double)relerrtwonorm(N, f_3d3 + k * N, F + k * N)); + errmax = max(maxerror, errmax); + printf("\tconsistency check: sup ( ||f_many-f||_2 / ||f||_2 ) = %.3g\n", maxerror); free(f_3d3); - - free(x); free(y); free(z); free(c); free(F); free(s_freq); free(t_freq); free(u_freq); - return (errmax>errfail); + + free(x); + free(y); + free(z); + free(c); + free(F); + free(s_freq); + free(t_freq); + free(u_freq); + return (errmax > errfail); } diff --git a/test/testutils.cpp b/test/testutils.cpp index cd2cd7bef..64b5d7a0a 100644 --- a/test/testutils.cpp +++ b/test/testutils.cpp @@ -9,16 +9,16 @@ and platform-indep, than having to compare the text output) Suggested compile (double/float versions): - g++ -std=c++14 -fopenmp testutils.cpp -I../include ../src/utils.o ../src/utils_precindep.o -o testutils -lgomp - g++ -std=c++14 -fopenmp testutils.cpp -I../include ../src/utils_32.o ../src/utils_precindep.o -o testutilsf -lgomp -DSINGLE + g++ -std=c++14 -fopenmp testutils.cpp -I../include ../src/utils.o + ../src/utils_precindep.o -o testutils -lgomp g++ -std=c++14 -fopenmp testutils.cpp + -I../include ../src/utils_32.o ../src/utils_precindep.o -o testutilsf -lgomp -DSINGLE */ // This switches FLT macro from double to float if SINGLE is defined, etc... #include using namespace finufft::utils; -int main(int argc, char* argv[]) -{ +int main(int argc, char *argv[]) { #ifdef SINGLE printf("testutilsf started...\n"); #else @@ -28,35 +28,41 @@ int main(int argc, char* argv[]) // test next235even... // Barnett 2/9/17, made smaller range 3/28/17. 
pass-fail 6/16/23 // The true outputs from {0,1,..,99}: - const BIGINT next235even_true[100] = {2, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 16, 16, 16, 16, 18, 18, 20, 20, 24, 24, 24, 24, 30, 30, 30, 30, 30, 30, 32, 32, 36, 36, 36, 36, 40, 40, 40, 40, 48, 48, 48, 48, 48, 48, 48, 48, 50, 50, 54, 54, 54, 54, 60, 60, 60, 60, 60, 60, 64, 64, 64, 64, 72, 72, 72, 72, 72, 72, 72, 72, 80, 80, 80, 80, 80, 80, 80, 80, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 96, 96, 96, 96, 96, 96, 100, 100, 100}; - for (BIGINT n=0;n<100;++n) { + const BIGINT next235even_true[100] = { + 2, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 16, 16, 16, 16, 18, 18, 20, + 20, 24, 24, 24, 24, 30, 30, 30, 30, 30, 30, 32, 32, 36, 36, 36, 36, 40, 40, 40, + 40, 48, 48, 48, 48, 48, 48, 48, 48, 50, 50, 54, 54, 54, 54, 60, 60, 60, 60, 60, + 60, 64, 64, 64, 64, 72, 72, 72, 72, 72, 72, 72, 72, 80, 80, 80, 80, 80, 80, 80, + 80, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 96, 96, 96, 96, 96, 96, 100, 100, 100}; + for (BIGINT n = 0; n < 100; ++n) { BIGINT o = next235even(n); BIGINT t = next235even_true[n]; - if (o!=t) { - printf("next235even(%lld) =\t%lld, error should be %lld!\n",(long long)n, (long long)o, (long long)t); + if (o != t) { + printf("next235even(%lld) =\t%lld, error should be %lld!\n", (long long)n, + (long long)o, (long long)t); return 1; } } - + // various old devel expts and comments... - //printf("starting huge next235even...\n"); // 1e11 takes 1 sec - //BIGINT n=(BIGINT)120573851963; - //printf("next235even(%ld) =\t%ld\n",n,next235even(n)); - //double* a; printf("%g\n",a[0]); // do deliberate segfault for bash debug! + // printf("starting huge next235even...\n"); // 1e11 takes 1 sec + // BIGINT n=(BIGINT)120573851963; + // printf("next235even(%ld) =\t%ld\n",n,next235even(n)); + // double* a; printf("%g\n",a[0]); // do deliberate segfault for bash debug! // test vector norms and norm difference routines... now pass-fail 6/16/23 BIGINT M = 1e4; std::vector a(M), b(M); - for (BIGINT j=0; j relerr) return 1; - if (abs(twonorm(M,&a[0]) - sqrt((FLT)M)) > relerr*sqrt((FLT)M)) return 1; - b[0] = CPX(0.0,0.0); // perturb b from a - if (abs(errtwonorm(M,&a[0],&b[0]) - 1.0) > relerr) return 1; - if (abs(sqrt((FLT)M)* relerrtwonorm(M,&a[0],&b[0]) - 1.0) > relerr) return 1; + FLT relerr = 2.0 * EPSILON; // 1 ULP, fine since 1.0 rep exactly + if (abs(infnorm(M, &a[0]) - 1.0) > relerr) return 1; + if (abs(twonorm(M, &a[0]) - sqrt((FLT)M)) > relerr * sqrt((FLT)M)) return 1; + b[0] = CPX(0.0, 0.0); // perturb b from a + if (abs(errtwonorm(M, &a[0], &b[0]) - 1.0) > relerr) return 1; + if (abs(sqrt((FLT)M) * relerrtwonorm(M, &a[0], &b[0]) - 1.0) > relerr) return 1; #ifdef SINGLE printf("testutilsf passed.\n"); diff --git a/tutorial/migrate2d1_test.c b/tutorial/migrate2d1_test.c index 3bed802bb..241f8c747 100644 --- a/tutorial/migrate2d1_test.c +++ b/tutorial/migrate2d1_test.c @@ -4,57 +4,64 @@ To compile (assuming FINUFFT include and lib in path): gcc migrate2d1_test.c -o migrate2d1_test -lfinufft -lfftw3 -lm */ -#include -#include -#include #include -#include #include +#include +#include +#include +#include int main(void) { - int N[2] = {300,200}; // N0, N1 output shape in nfft3 sense - int M = 500000; // num. nonuniform input points - double tol = 1e-13; // user must choose (unlike nfft3's simple call) - + int N[2] = {300, 200}; // N0, N1 output shape in nfft3 sense + int M = 500000; // num. 
nonuniform input points + double tol = 1e-13; // user must choose (unlike nfft3's simple call) + // user allocates all external arrays (and no internal ones) - double* x = (double *)malloc(sizeof(double)*M); // x (0th) coords only here - double* y = (double *)malloc(sizeof(double)*M); // y (1st) coords need separate ptr - double complex* f = (double complex*)malloc(sizeof(double complex)*M); - double complex* f_hat = (double complex*)malloc(sizeof(double complex)*N[0]*N[1]); // output - + double *x = (double *)malloc(sizeof(double) * M); // x (0th) coords only here + double *y = (double *)malloc(sizeof(double) * M); // y (1st) coords need separate ptr + double complex *f = (double complex *)malloc(sizeof(double complex) * M); + double complex *f_hat = + (double complex *)malloc(sizeof(double complex) * N[0] * N[1]); // output + // start with exactly the same "user data" as in nfft2d1_test.c... - srand(0); // fix seed - for (int j=0; j -#include -#include +#include "nfft3.h" #include +#include +#include +#include #include -#include "nfft3.h" int main(void) { - int N[2] = {300,200}; // N1, N2 output mode numbers - int M = 500000; // num. nonuniform input points - int dim=2; + int N[2] = {300, 200}; // N1, N2 output mode numbers + int M = 500000; // num. nonuniform input points + int dim = 2; nfft_plan p; - nfft_init(&p, dim, N, M); // allocates user I/O arrays too + nfft_init(&p, dim, N, M); // allocates user I/O arrays too // make some "user data" (we must use arrays that nfft allocated)... srand(0); // fix seed - for (int j=0; j
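For orientation alongside the two tutorial programs above (nfft2d1_test.c and its FINUFFT counterpart migrate2d1_test.c): a minimal, self-contained sketch of a 2D type-1 transform through FINUFFT's plain C++ interface, in double precision with default options. This is a sketch only, not one of the repository's tutorial files; the point count, mode numbers, and tolerance merely mirror those files, the random data is illustrative, and how the two mode dimensions map onto nfft3's N[0]/N[1] ordering is deliberately not addressed here.

// Build roughly as: g++ -O2 sketch2d1.cpp -o sketch2d1 -lfinufft
#include "finufft.h"
#include <complex>
#include <cstdlib>
#include <vector>

int main() {
  const int64_t M  = 500000;          // number of nonuniform points
  const int64_t N1 = 300, N2 = 200;   // numbers of modes in x and y
  const double tol = 1e-13;           // requested relative accuracy
  const double PI  = 3.141592653589793;

  std::vector<double> x(M), y(M);                     // NU point coords in [-pi,pi)
  std::vector<std::complex<double>> c(M), F(N1 * N2); // strengths in, mode coeffs out

  srand(0); // fixed seed, as in the tutorial data
  for (int64_t j = 0; j < M; ++j) {
    x[j] = PI * (2.0 * ((double)rand() / RAND_MAX) - 1.0);
    y[j] = PI * (2.0 * ((double)rand() / RAND_MAX) - 1.0);
    c[j] = std::complex<double>(1.0, 0.0);
  }

  // type 1: NU points -> N1*N2 Fourier modes; +1 exponential sign, NULL = default opts
  int ier = finufft2d1(M, x.data(), y.data(), c.data(), +1, tol, N1, N2, F.data(), NULL);
  return ier; // nonzero indicates a FINUFFT error/warning code
}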