From 5aa27053e5cc74f2c2d8346055d54bbf16b57317 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Wed, 8 May 2024 16:24:18 -0400 Subject: [PATCH] using the new foldrescale --- CMakeLists.txt | 13 +- CMakePresets.json | 19 ++- devel/CMakeLists.txt | 21 +++ devel/foldrescale.cpp | 294 ++++++++++++++++++++++++++++++++++ include/finufft/defs.h | 10 +- include/finufft_errors.h | 3 +- include/finufft_opts.h | 24 +++ include/finufft_spread_opts.h | 2 +- perftest/spreadtestnd.cpp | 2 - src/finufft.cpp | 41 +++-- src/spreadinterp.cpp | 125 ++++++--------- 11 files changed, 447 insertions(+), 107 deletions(-) create mode 100644 devel/CMakeLists.txt create mode 100644 devel/foldrescale.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 3e9a4d3f5..0d7c7130b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,6 +31,7 @@ option(FINUFFT_USE_OPENMP "Whether to use OpenMP for parallelization. If disable option(FINUFFT_USE_CUDA "Whether to build CUDA accelerated FINUFFT library (libcufinufft). This is completely independent of the main FINUFFT library" OFF) option(FINUFFT_USE_CPU "Whether to build the ordinary FINUFFT library (libfinufft)." ON) option(FINUFFT_STATIC_LINKING "Whether to link the static FINUFFT library (libfinufft_static)." ON) +option(FINUFTT_BUILD_DEVEL "Whether to build developement executables" OFF) # sphinx tag (don't remove): @cmake_opts_end if(FINUFFT_USE_CPU) @@ -45,10 +46,11 @@ if(FINUFFT_USE_CPU) endif() set(CPM_DOWNLOAD_VERSION 0.38.0) - include(cmake/setupCPM.cmake) - set(FFTW_VERSION 3.3.10) + + include(cmake/setupCPM.cmake) include(cmake/setupFFTW.cmake) + endif() if (FINUFFT_BUILD_MATLAB) @@ -106,11 +108,11 @@ function(finufft_link_test target) endif() endif() else() - target_link_libraries(${target} PRIVATE finufft) if(WIN32) target_compile_definitions(${target} PRIVATE FINUFFT_DLL) endif() endif() + target_link_libraries(${target} PUBLIC finufft) enable_asan(${target}) endfunction() @@ -165,7 +167,6 @@ if(FINUFFT_USE_CPU) target_compile_definitions(finufft_f64 PRIVATE) set_finufft_options(finufft_f64) target_link_libraries(finufft_f64 PUBLIC ${FINUFFT_FFTW_LIBRARIES}) - if(WIN32) add_library(finufft_f32_dll OBJECT ${FINUFFT_PRECISION_DEPENDENT_SOURCES}) target_compile_definitions(finufft_f32_dll PRIVATE SINGLE dll_EXPORTS FINUFFT_DLL) @@ -246,6 +247,10 @@ if (FINUFFT_BUILD_MATLAB) add_subdirectory(matlab) endif () +if (FINUFTT_BUILD_DEVEL) + add_subdirectory(devel) +endif () + include(GNUInstallDirs) install(TARGETS ${INSTALL_TARGETS} PUBLIC_HEADER) install(FILES ${PROJECT_SOURCE_DIR}/LICENSE diff --git a/CMakePresets.json b/CMakePresets.json index 117582714..5276f5018 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -31,7 +31,22 @@ "generator": "Ninja Multi-Config", "cacheVariables": { "FINUFFT_BUILD_TESTS": "ON", - "FINUFFT_BUILD_EXAMPLES": "ON" + "FINUFFT_BUILD_EXAMPLES": "ON", + "FINUFTT_BUILD_DEVEL": "ON" + } + }, + { + "name": "benchmark", + "binaryDir": "build/benchmark", + "displayName": "Benchmark", + "description": "Benchmark release configuration (ninja)", + "generator": "Ninja", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "RelWithDebInfo", + "FINUFFT_BUILD_TESTS": "ON", + "FINUFFT_BUILD_EXAMPLES": "ON", + "FINUFFT_FFTW_SUFFIX": "", + "FINUFFT_USE_OPENMP": "OFF" } }, { @@ -104,7 +119,7 @@ { "name": "dev", "configurePreset": "dev", - "configuration": "Debug" + "configuration": "RelWithDebInfo" }, { "name": "ninja-multi", diff --git a/devel/CMakeLists.txt b/devel/CMakeLists.txt new file mode 100644 index 000000000..01b8e48dd --- /dev/null +++ 
b/devel/CMakeLists.txt @@ -0,0 +1,21 @@ +project(finufft_devel) +# Set the minimum required version of CMake +cmake_minimum_required(VERSION 3.5) + + +# include cpm cmake, downloading it +CPMAddPackage( + NAME benchmark + GITHUB_REPOSITORY google/benchmark + VERSION 1.8.3 + OPTIONS "BENCHMARK_ENABLE_TESTING OFF" + +) + +if (benchmark_ADDED) + # patch benchmark target + set_target_properties(benchmark PROPERTIES CXX_STANDARD 17) +endif() + +add_executable(foldrescale foldrescale.cpp) +target_link_libraries(foldrescale finufft benchmark) \ No newline at end of file diff --git a/devel/foldrescale.cpp b/devel/foldrescale.cpp new file mode 100644 index 000000000..e038be252 --- /dev/null +++ b/devel/foldrescale.cpp @@ -0,0 +1,294 @@ +#include "finufft/defs.h" +#include +#include +#include +#include +// no vectorize +#pragma GCC optimize("no-tree-vectorize") +/* local NU coord fold+rescale macro: does the following affine transform to x: + when p=true: map [-3pi,-pi) and [-pi,pi) and [pi,3pi) each to [0,N) + otherwise, map [-N,0) and [0,N) and [N,2N) each to [0,N) + Thus, only one period either side of the principal domain is folded. + (It is *so* much faster than slow std::fmod that we stick to it.) + This explains FINUFFT's allowed input domain of [-3pi,3pi). + Speed comparisons of this macro vs a function are in devel/foldrescale*. + The macro wins hands-down on i7, even for modern GCC9. + This should be done in C++ not as a macro, someday. +*/ +#define FOLDRESCALE(x, N, p) (p ? \ + (x + (x>=-PI ? (x=0.0 ? (x<(FLT)N ? x : x-(FLT)N) : x+(FLT)N)) + + +#define FOLDRESCALE04(x, N, p) (p ? \ + ((x * FLT(M_1_2PI) + FLT(0.5)) - floor(x * FLT(M_1_2PI) + FLT(0.5))) * FLT(N) : \ + ((x/FLT(N))-floor(x/FLT(N)))*FLT(N)) + +#define FOLDRESCALE05(x, N, p) FLT(N) * (p ? \ + ((x * FLT(M_1_2PI) + FLT(0.5)) - floor(x * FLT(M_1_2PI) + FLT(0.5))) : \ + ((x/FLT(N))-floor(x/FLT(N)))) + +inline __attribute__((always_inline)) +FLT foldRescale00(FLT x, BIGINT N, bool p) { + FLT result; + FLT fN = FLT(N); + if (p) { + static constexpr FLT x2pi = FLT(M_1_2PI); + result = x * x2pi + FLT(0.5); + result -= floor(result); + } else { + const FLT invN = FLT(1.0) / fN; + result = x * invN; + result -= floor(result); + } + return result * fN; +} + +inline __attribute__((always_inline)) +FLT foldRescale01(FLT x, BIGINT N, bool p) { + return p ? (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT) M_1_2PI * N) : + (x >= 0.0 ? (x < (FLT) N ? x : x - (FLT) N) : x + (FLT) N); +} + +template +inline __attribute__((always_inline)) +FLT foldRescale02(FLT x, BIGINT N) { + if constexpr (p) { + return (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT) M_1_2PI * N); + } else { + return (x >= 0.0 ? (x < (FLT) N ? 
x : x - (FLT) N) : x + (FLT) N); + } +} + +template +inline __attribute__((always_inline)) +FLT foldRescale03(FLT x, BIGINT N) { + FLT result; + FLT fN = FLT(N); + if constexpr (p) { + static constexpr FLT x2pi = FLT(M_1_2PI); + result = x * x2pi + FLT(0.5); + result -= floor(result); + } else { + const FLT invN = FLT(1.0) / fN; + result = x * invN; + result -= floor(result); + } + return result * fN; +} + +static std::mt19937_64 gen; +static std::uniform_real_distribution<> dis(-3 * M_PI, 3 * M_PI); +static const auto N = std::uniform_int_distribution<>{0, 1000}(gen); +static std::uniform_real_distribution<> disN(-N, 2*N); +static volatile auto pirange = true; +static volatile auto notPirange = !pirange; + +static void BM_BASELINE(benchmark::State &state) { + for (auto _: state) { + benchmark::DoNotOptimize(dis(gen)); + } +} + +static void BM_FoldRescaleMacro(benchmark::State &state) { + for (auto _: state) { + FLT x = dis(gen); + benchmark::DoNotOptimize(FOLDRESCALE(x, N, pirange)); + } +} + +static void BM_FoldRescaleMacroN(benchmark::State &state) { + for (auto _: state) { + FLT x = disN(gen); + benchmark::DoNotOptimize(FOLDRESCALE(x, N, notPirange)); + } +} + +static void BM_FoldRescale00(benchmark::State &state) { + for (auto _: state) { + FLT x = dis(gen); + benchmark::DoNotOptimize(foldRescale00(x, N, pirange)); + } +} + + +static void BM_FoldRescale00N(benchmark::State &state) { + for (auto _: state) { + FLT x = disN(gen); + benchmark::DoNotOptimize(foldRescale00(x, N, notPirange)); + } +} + + +static void BM_FoldRescale01(benchmark::State &state) { + for (auto _: state) { + FLT x = dis(gen); + benchmark::DoNotOptimize(foldRescale01(x, N, pirange)); + } +} + + +static void BM_FoldRescale01N(benchmark::State &state) { + for (auto _: state) { + FLT x = disN(gen); + benchmark::DoNotOptimize(foldRescale01(x, N, notPirange)); + } +} + +static void BM_FoldRescale02(benchmark::State &state) { + for (auto _: state) { + FLT x = dis(gen); + benchmark::DoNotOptimize(foldRescale02(x, N)); + } +} + + +static void BM_FoldRescale02N(benchmark::State &state) { + for (auto _: state) { + FLT x = disN(gen); + benchmark::DoNotOptimize(foldRescale02(x, N)); + } +} + + +static void BM_FoldRescale03(benchmark::State &state) { + for (auto _: state) { + FLT x = dis(gen); + benchmark::DoNotOptimize(foldRescale03(x, N)); + } +} + +static void BM_FoldRescale03N(benchmark::State &state) { + for (auto _: state) { + FLT x = disN(gen); + benchmark::DoNotOptimize(foldRescale03(x, N)); + } +} + +static void BM_FoldRescale04(benchmark::State &state) { + for (auto _: state) { + FLT x = dis(gen); + benchmark::DoNotOptimize(FOLDRESCALE04(x, N, pirange)); + } +} + +static void BM_FoldRescale04N(benchmark::State &state) { + for (auto _: state) { + FLT x = disN(gen); + benchmark::DoNotOptimize(FOLDRESCALE04(x, N, notPirange)); + } +} + +static void BM_FoldRescale05(benchmark::State &state) { + for (auto _: state) { + FLT x = dis(gen); + benchmark::DoNotOptimize(FOLDRESCALE05(x, N, pirange)); + } +} + +static void BM_FoldRescale05N(benchmark::State &state) { + for (auto _: state) { + FLT x = disN(gen); + benchmark::DoNotOptimize(FOLDRESCALE05(x, N, notPirange)); + } +} + + +BENCHMARK(BM_BASELINE); + +BENCHMARK(BM_FoldRescaleMacro); +BENCHMARK(BM_FoldRescale00); +BENCHMARK(BM_FoldRescale01); +BENCHMARK(BM_FoldRescale02); +BENCHMARK(BM_FoldRescale03); +BENCHMARK(BM_FoldRescale04); +BENCHMARK(BM_FoldRescale05); + +BENCHMARK(BM_FoldRescaleMacroN); +BENCHMARK(BM_FoldRescale00N); +BENCHMARK(BM_FoldRescale01N); 
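// For reference, every variant benchmarked above computes the same mapping:
//   p=true : x -> N * frac(x/(2*pi) + 0.5)   (so [-pi,pi) maps onto [0,N))
//   p=false: x -> N * frac(x/N)              (so [0,N) maps onto itself)
// A compact equivalent with worked values; the helper name foldRescaleRef is
// illustrative only and not part of FINUFFT:
static inline FLT foldRescaleRef(FLT x, BIGINT N, bool p) {
  const FLT r = p ? x * FLT(M_1_2PI) + FLT(0.5) : x / FLT(N); // shift into units of one period
  return (r - floor(r)) * FLT(N);                             // frac(r), rescaled to [0,N)
}
// e.g. N=100: x=-pi/2 -> 25 and x=5*pi/2 -> 75 (p=true); x=-10 -> 90 (p=false).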
+BENCHMARK(BM_FoldRescale02N); +BENCHMARK(BM_FoldRescale03N); +BENCHMARK(BM_FoldRescale04N); +BENCHMARK(BM_FoldRescale05N); + +void testFoldRescaleFunctions() { + for (bool p: {true, false}) { + for (int i = 0; i < 1024; ++i) { // Run the test 1000 times + FLT x = dis(gen); + FLT resultMacro = FOLDRESCALE(x, N, p); + FLT result00 = foldRescale00(x, N, p); + FLT result01 = foldRescale01(x, N, p); + FLT result02 = p ? foldRescale02(x, N) : foldRescale02(x, N); + FLT result03 = p ? foldRescale03(x, N) : foldRescale03(x, N); + FLT result04 = FOLDRESCALE04(x, N, p); + FLT result05 = FOLDRESCALE05(x, N, p); + // function that compares two floating point number with a tolerance, using relative error + auto compare = [](FLT a, FLT b) { + return std::abs(a - b) > std::max(std::abs(a), std::abs(b)) * 10e-13; + }; + + if (compare(resultMacro, result00)) { + std::cout << "resultMacro: " << resultMacro << " result00: " << result00 << std::endl; + throw std::runtime_error("function00 is wrong"); + } + if (compare(resultMacro, result01)) { + std::cout << "resultMacro: " << resultMacro << " result01: " << result01 << std::endl; + throw std::runtime_error("function01 is wrong"); + } + if (compare(resultMacro, result02)) { + std::cout << "resultMacro: " << resultMacro << " result02: " << result02 << std::endl; + throw std::runtime_error("function02 is wrong"); + } + if (compare(resultMacro, result03)) { + std::cout << "resultMacro: " << resultMacro << " result03: " << result03 << std::endl; + throw std::runtime_error("function03 is wrong"); + } + if (compare(resultMacro, result04)) { + std::cout << "resultMacro: " << resultMacro << " result04: " << result04 << std::endl; + throw std::runtime_error("function04 is wrong"); + } + if (compare(resultMacro, result05)) { + std::cout << "resultMacro: " << resultMacro << " result05: " << result05 << std::endl; + throw std::runtime_error("function05 is wrong"); + } + } + } +} + +class BaselineSubtractingReporter : public benchmark::ConsoleReporter { +public: + bool ReportContext(const Context &context) override { + return benchmark::ConsoleReporter::ReportContext(context); + } + + void ReportRuns(const std::vector &reports) override { + for (const auto &run: reports) { + if (run.benchmark_name() == "BM_BASELINE") { + baseline_time = run.cpu_accumulated_time; + } else { + Run modified_run = run; + modified_run.cpu_accumulated_time -= baseline_time; + benchmark::ConsoleReporter::ReportRuns({modified_run}); + } + } + } + +private: + double baseline_time = 0.0; +}; + +int main(int argc, char **argv) { + pirange = argc & 1; + notPirange = !pirange; + static std::random_device rd; + const auto seed = rd(); + std::cout << "Seed: " << seed << "\n"; +// gen.seed(226307105); + gen.seed(seed); + testFoldRescaleFunctions(); + ::benchmark::Initialize(&argc, argv); + BaselineSubtractingReporter reporter; + ::benchmark::RunSpecifiedBenchmarks(&reporter); + return 0; +} diff --git a/include/finufft/defs.h b/include/finufft/defs.h index 3af79bed6..8780ff914 100644 --- a/include/finufft/defs.h +++ b/include/finufft/defs.h @@ -37,7 +37,15 @@ #include // we define C++ complex type only #define CPX std::complex - +// inline macro, to force inlining of small functions +// this avoids the use of macros to implement functions +#if defined(_MSC_VER) +#define FINUFFT_ALWAYS_INLINE __forceinline +#elif defined(__GNUC__) || defined(__clang__) +#define FINUFFT_ALWAYS_INLINE __attribute__((always_inline)) inline +#else +#define FINUFFT_ALWAYS_INLINE inline +#endif // ------------- Library-wide 
algorithm parameter settings ---------------- diff --git a/include/finufft_errors.h b/include/finufft_errors.h index 17305ebf6..b2555d522 100644 --- a/include/finufft_errors.h +++ b/include/finufft_errors.h @@ -6,7 +6,7 @@ #define FINUFFT_WARN_EPS_TOO_SMALL 1 #define FINUFFT_ERR_MAXNALLOC 2 #define FINUFFT_ERR_SPREAD_BOX_SMALL 3 -#define FINUFFT_ERR_SPREAD_PTS_OUT_RANGE 4 +#define FINUFFT_ERR_SPREAD_PTS_OUT_RANGE 4 // DEPRECATED #define FINUFFT_ERR_SPREAD_ALLOC 5 #define FINUFFT_ERR_SPREAD_DIR 6 #define FINUFFT_ERR_UPSAMPFAC_TOO_SMALL 7 @@ -23,4 +23,5 @@ #define FINUFFT_ERR_BINSIZE_NOTVALID 18 #define FINUFFT_ERR_INSUFFICIENT_SHMEM 19 #define FINUFFT_ERR_NUM_NU_PTS_INVALID 20 +#define FINUFFT_WARN_CHKBND_NOT_DEFAULT 21 #endif diff --git a/include/finufft_opts.h b/include/finufft_opts.h index 3a0156000..bb93b8457 100644 --- a/include/finufft_opts.h +++ b/include/finufft_opts.h @@ -5,6 +5,30 @@ #ifndef FINUFFT_OPTS_H #define FINUFFT_OPTS_H +// Marco Barbone: 5.8.2024 +// These are user-facing to that the user can reset to the default value +// TODO: the various options could be macros so that the user does not need to +// look up documentation to change to another value +// Question: would these be be enums? + +#define FINUFFT_MODEORD_DEFAULT (0) +#define FINUFFT_CHKBND_DEFAULT (1) + +#define FINUFFT_DEBUG_DEFAULT (0) +#define FINUFFT_SPREAD_DEBUG_DEFAULT (0) +#define FINUFFT_SHOWWARN_DEFAULT (1) + +#define FINUFFT_NTHREADS_DEFAULT (0) +#define FINUFFT_FFTW_DEFAULT (FFTW_ESTIMATE) +#define FINUFFT_SPREAD_SORT_DEFAULT (2) +#define FINUFFT_SPREAD_KEREVALMETH_DEFAULT (1) +#define FINUFFT_SPREAD_KERPAD_DEFAULT (1) +#define FINUFFT_UPSAMPFAC_DEFAULT (0.0) +#define FINUFFT_SPREAD_THREAD_DEFAULT (0) +#define FINUFFT_MAXBATCHSIZE_DEFAULT (0) +#define FINUFFT_SPREAD_NTHR_ATOMIC_DEFAULT (-1) +#define FINUFFT_SPREAD_MAX_SP_SIZE_DEFAULT (0) + typedef struct finufft_opts{ // defaults see finufft.cpp:finufft_default_opts() // sphinx tag (don't remove): @opts_start // FINUFFT options: diff --git a/include/finufft_spread_opts.h b/include/finufft_spread_opts.h index 2b6f4fbcf..5246bf75d 100644 --- a/include/finufft_spread_opts.h +++ b/include/finufft_spread_opts.h @@ -13,7 +13,7 @@ typedef struct finufft_spread_opts { int nspread; // w, the kernel width in grid pts int spread_direction; // 1 means spread NU->U, 2 means interpolate U->NU int pirange; // 0: NU periodic domain is [0,N), 1: domain [-pi,pi) - int chkbnds; // 0: don't check NU pts in 3-period range; 1: do + int chkbnds; // 0: don't check NU pts in 3-period range; 1: do [DEPRECATED] int sort; // 0: don't sort NU pts, 1: do, 2: heuristic choice int kerevalmeth; // 0: direct exp(sqrt()), or 1: Horner ppval, fastest int kerpad; // 0: no pad w to mult of 4, 1: do pad diff --git a/perftest/spreadtestnd.cpp b/perftest/spreadtestnd.cpp index b8c05b1aa..44500fb47 100644 --- a/perftest/spreadtestnd.cpp +++ b/perftest/spreadtestnd.cpp @@ -111,8 +111,6 @@ int main(int argc, char* argv[]) return ier_set; } opts.pirange = 0; // crucial, since below has NU pts on [0,Nd] in each dim - opts.chkbnds = 0; // only for debug, since below code has correct bounds); - // however, 1 can make a >5% slowdown for low-tol 3D. opts.debug = debug; // print more diagnostics? 
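/* Usage sketch for the FINUFFT_*_DEFAULT macros introduced in finufft_opts.h
   above: a caller can restore a single field to its documented default without
   looking the value up. The double-precision API name finufft_default_opts is
   assumed here and is not part of this patch:
     finufft_opts opts;
     finufft_default_opts(&opts);
     opts.spread_sort = 0;                           // experiment: no sorting
     opts.spread_sort = FINUFFT_SPREAD_SORT_DEFAULT; // restore the default (2)
*/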
opts.sort = sort; opts.flags = flags;
diff --git a/src/finufft.cpp b/src/finufft.cpp index a05e1c02e..db3fe71b1 100644 --- a/src/finufft.cpp +++ b/src/finufft.cpp @@ -136,6 +136,13 @@ int setup_spreader_for_nufft(finufft_spread_opts &spopts, FLT eps, finufft_opts spopts.atomic_threshold = opts.spread_nthr_atomic; if (opts.spread_max_sp_size>0) // overrides spopts.max_subproblem_size = opts.spread_max_sp_size; + if (opts.chkbnds != FINUFFT_CHKBND_DEFAULT) { + fprintf(stderr, "the chkbnds option is deprecated; please use the default value\n"); + // if another error occurred before this, return that error; otherwise return the warning + if (!ier) { + return FINUFFT_WARN_CHKBND_NOT_DEFAULT; + } + } return ier; } @@ -529,23 +536,23 @@ void FINUFFT_DEFAULT_OPTS(finufft_opts *o) // Sphinx sucks the below code block into the web docs, hence keep it clean... { // sphinx tag (don't remove): @defopts_start - o->modeord = 0; - o->chkbnds = 1; - - o->debug = 0; - o->spread_debug = 0; - o->showwarn = 1; - - o->nthreads = 0; - o->fftw = FFTW_ESTIMATE; - o->spread_sort = 2; - o->spread_kerevalmeth = 1; - o->spread_kerpad = 1; - o->upsampfac = 0.0; - o->spread_thread = 0; - o->maxbatchsize = 0; - o->spread_nthr_atomic = -1; - o->spread_max_sp_size = 0; + o->modeord = FINUFFT_MODEORD_DEFAULT; + o->chkbnds = FINUFFT_CHKBND_DEFAULT; + + o->debug = FINUFFT_DEBUG_DEFAULT; + o->spread_debug = FINUFFT_SPREAD_DEBUG_DEFAULT; + o->showwarn = FINUFFT_SHOWWARN_DEFAULT; + + o->nthreads = FINUFFT_NTHREADS_DEFAULT; + o->fftw = FINUFFT_FFTW_DEFAULT; + o->spread_sort = FINUFFT_SPREAD_SORT_DEFAULT; + o->spread_kerevalmeth = FINUFFT_SPREAD_KEREVALMETH_DEFAULT; + o->spread_kerpad = FINUFFT_SPREAD_KERPAD_DEFAULT; + o->upsampfac = FINUFFT_UPSAMPFAC_DEFAULT; + o->spread_thread = FINUFFT_SPREAD_THREAD_DEFAULT; + o->maxbatchsize = FINUFFT_MAXBATCHSIZE_DEFAULT; + o->spread_nthr_atomic = FINUFFT_SPREAD_NTHR_ATOMIC_DEFAULT; + o->spread_max_sp_size = FINUFFT_SPREAD_MAX_SP_SIZE_DEFAULT; // sphinx tag (don't remove): @defopts_end }
diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 8d6e34d96..7768ae9e2 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -13,30 +13,22 @@ /* local NU coord fold+rescale macro: does the following affine transform to x: - when p=true: map [-3pi,-pi) and [-pi,pi) and [pi,3pi) each to [0,N) - otherwise, map [-N,0) and [0,N) and [N,2N) each to [0,N) - Thus, only one period either side of the principal domain is folded. - (It is *so* much faster than slow std::fmod that we stick to it.) - This explains FINUFFT's allowed input domain of [-3pi,3pi). - Speed comparisons of this macro vs a function are in devel/foldrescale*. - The macro wins hands-down on i7, even for modern GCC9. - This should be done in C++ not as a macro, someday. + when p=true: x is folded mod 2pi and rescaled to [0,N) + otherwise, x is folded mod N and rescaled to [0,N) + Note: folding very large inputs can introduce numerical inaccuracies */ -#define FOLDRESCALE(x,N,p) (p ? \ (x + (x>=-PI ? (x<PI ? PI : -PI) : 3*PI)) * ((FLT)M_1_2PI*N) : \ (x>=0.0 ? (x<(FLT)N ? x : x-(FLT)N) : x+(FLT)N)) - using namespace std; using namespace finufft::utils; // access to timer namespace finufft { namespace spreadinterp { - + // declarations of purely internal functions...
(thus need not be in .h) -static inline void set_kernel_args(FLT *args, FLT x, const finufft_spread_opts& opts); -static inline void evaluate_kernel_vector(FLT *ker, FLT *args, const finufft_spread_opts& opts, const int N); -static inline void eval_kernel_vec_Horner(FLT *ker, const FLT z, const int w, const finufft_spread_opts &opts); +static FINUFFT_ALWAYS_INLINE FLT fold_rescale(FLT x, BIGINT N, bool p) noexcept; +static FINUFFT_ALWAYS_INLINE void set_kernel_args(FLT *args, FLT x, const finufft_spread_opts& opts); +static FINUFFT_ALWAYS_INLINE void evaluate_kernel_vector(FLT *ker, FLT *args, const finufft_spread_opts& opts, const int N); +static FINUFFT_ALWAYS_INLINE void eval_kernel_vec_Horner(FLT *ker, FLT z, const int w, const finufft_spread_opts &opts); void interp_line(FLT *out,FLT *du, FLT *ker,BIGINT i1,BIGINT N1,int ns); void interp_square(FLT *out,FLT *du, FLT *ker1, FLT *ker2, BIGINT i1,BIGINT i2,BIGINT N1,BIGINT N2,int ns); void interp_cube(FLT *out,FLT *du, FLT *ker1, FLT *ker2, FLT *ker3, @@ -68,7 +60,6 @@ void get_subgrid(BIGINT &offset1,BIGINT &offset2,BIGINT &offset3,BIGINT &size1, FLT* kz0,int ns, int ndims); - // ========================================================================== int spreadinterp( @@ -119,10 +110,9 @@ int spreadinterp( 1D, only kx and ky read in 2D). These should lie in the box 0<=kx<=N1 etc (if pirange=0), - or -pi<=kx<=pi (if pirange=1). However, points up to +-1 period + or -pi<=kx<=pi (if pirange=1). However, points outside this domain are also correctly folded back into this - domain, but pts beyond this either raise an error (if chkbnds=1) - or a crash (if chkbnds=0). + domain opts - spread/interp options struct, documented in ../include/finufft_spread_opts.h Inputs/Outputs: @@ -133,8 +123,6 @@ int spreadinterp( 0 indicates success; other values have meanings in ../docs/error.rst, with following modifications: 3 : one or more non-trivial box dimensions is less than 2.nspread. - 4 : nonuniform points outside [-Nm,2*Nm] or [-3pi,3pi] in at least one - dimension m=1,2,3. 5 : failed allocate sort indices Magland Dec 2016. Barnett openmp version, many speedups 1/16/17-2/16/17 @@ -178,11 +166,9 @@ int spreadcheck(BIGINT N1, BIGINT N2, BIGINT N3, BIGINT M, FLT *kx, FLT *ky, /* This does just the input checking and reporting for the spreader. See spreadinterp() for input arguments and meaning of returned value. Split out by Melody Shih, Jun 2018. Finiteness chk Barnett 7/30/18. - Bypass FOLDRESCALE macro which has inevitable rounding err even nr +pi, - giving fake invalids well inside the [-3pi,3pi] domain, 4/9/21. + Marco Barbone 5.8.24 removed bounds check as new foldrescale is not limited to [-3pi,3pi) */ { - CNTime timer; // INPUT CHECKING & REPORTING .... cuboid not too small for spreading? 
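  /* Why the old NU-point range check could be dropped (deleted block below):
     the new fold_rescale is periodic for any finite input, not just the central
     three periods. For example, with pirange=1 and N1=64,
         kx = 100*pi  ->  64 * frac(100*pi/(2*pi) + 0.5) = 64 * 0.5 = 32,
     whereas the old FOLDRESCALE macro only handled [-3pi,3pi) and such a point
     was rejected with FINUFFT_ERR_SPREAD_PTS_OUT_RANGE (now marked DEPRECATED). */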
int minN = 2*opts.nspread; if (N11 && N21 && N33.0*PI) : (kx[i]<-N1 || kx[i]>2*N1)) || !isfinite(kx[i])) { - fprintf(stderr,"%s NU pt not in valid range (central three periods): kx[%lld]=%.16g, N1=%lld (pirange=%d)\n",__func__, (long long)i, kx[i], (long long)N1,opts.pirange); - return FINUFFT_ERR_SPREAD_PTS_OUT_RANGE; - } - } - if (ndims>1) - for (BIGINT i=0; i3.0*PI) : (ky[i]<-N2 || ky[i]>2*N2)) || !isfinite(ky[i])) { - fprintf(stderr,"%s NU pt not in valid range (central three periods): ky[%lld]=%.16g, N2=%lld (pirange=%d)\n",__func__, (long long)i, ky[i], (long long)N2,opts.pirange); - return FINUFFT_ERR_SPREAD_PTS_OUT_RANGE; - } - } - if (ndims>2) - for (BIGINT i=0; i3.0*PI) : (kz[i]<-N3 || kz[i]>2*N3)) || !isfinite(kz[i])) { - fprintf(stderr,"%s NU pt not in valid range (central three periods): kz[%lld]=%.16g, N3=%lld (pirange=%d)\n",__func__, (long long)i, kz[i], (long long)N3,opts.pirange); - return FINUFFT_ERR_SPREAD_PTS_OUT_RANGE; - } - } - if (opts.debug) printf("\tNU bnds check:\t\t%.3g s\n",timer.elapsedsec()); - } - return 0; + return 0; } @@ -237,11 +194,8 @@ int indexSort(BIGINT* sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, BIGINT M, Inputs: M - number of input NU points. - kx,ky,kz - length-M arrays of real coords of NU pts, in the domain - for FOLDRESCALE, which includes [0,N1], [0,N2], [0,N3] - respectively, if opts.pirange=0; or [-pi,pi] if opts.pirange=1. + kx,ky,kz - length-M arrays of real coords of NU pts. (only kz used in 1D, only kx and ky used in 2D.) - These must have been bounds-checked already; see spreadcheck. N1,N2,N3 - integer sizes of overall box (set N2=N3=1 for 1D, N3=1 for 2D). 1 = x (fastest), 2 = y (medium), 3 = z (slowest). opts - spreading options struct, see ../include/finufft_spread_opts.h @@ -381,9 +335,9 @@ int spreadSorted(BIGINT* sort_indices,BIGINT N1, BIGINT N2, BIGINT N3, FLT *dd0=(FLT*)malloc(sizeof(FLT)*M0*2); // complex strength data for (BIGINT j=0; j1) ky0[j]=FOLDRESCALE(ky[kk],N2,opts.pirange); - if (N3>1) kz0[j]=FOLDRESCALE(kz[kk],N3,opts.pirange); + kx0[j]= fold_rescale(kx[kk], N1, opts.pirange); + if (N2>1) ky0[j]= fold_rescale(ky[kk], N2, opts.pirange); + if (N3>1) kz0[j]= fold_rescale(kz[kk], N3, opts.pirange); dd0[j*2]=data_nonuniform[kk*2]; // real part dd0[j*2+1]=data_nonuniform[kk*2+1]; // imag part } @@ -474,11 +428,11 @@ int interpSorted(BIGINT* sort_indices,BIGINT N1, BIGINT N2, BIGINT N3, for (int ibuf=0; ibuf=2) - yjlist[ibuf] = FOLDRESCALE(ky[j],N2,opts.pirange); + yjlist[ibuf] = fold_rescale(ky[j], N2, opts.pirange); if(ndims == 3) - zjlist[ibuf] = FOLDRESCALE(kz[j],N3,opts.pirange); + zjlist[ibuf] = fold_rescale(kz[j], N3, opts.pirange); } // Loop over targets in chunk @@ -583,7 +537,6 @@ int setup_spreader(finufft_spread_opts &opts, FLT eps, double upsampfac, // write out default finufft_spread_opts (some overridden in setup_spreader_for_nufft) opts.spread_direction = 0; // user should always set to 1 or 2 as desired opts.pirange = 1; // user also should always set this - opts.chkbnds = 0; opts.sort = 2; // 2:auto-choice opts.kerpad = 0; // affects only evaluate_kernel_vector opts.kerevalmeth = kerevalmeth; @@ -1218,7 +1171,7 @@ void bin_sort_singlethread(BIGINT *ret, BIGINT M, FLT *kx, FLT *ky, FLT *kz, * * Inputs: M - number of input NU points. * kx,ky,kz - length-M arrays of real coords of NU pts, in the domain - * for FOLDRESCALE, which includes [0,N1], [0,N2], [0,N3] + * for fold_rescale, which includes [0,N1], [0,N2], [0,N3] * respectively, if pirange=0; or [-pi,pi] if pirange=1. 
 * N1,N2,N3 - integer sizes of overall box (N2=N3=1 for 1D, N3=1 for 2D) * bin_size_x,y,z - what binning box size to use in each dimension @@ -1248,9 +1201,9 @@ void bin_sort_singlethread(BIGINT *ret, BIGINT M, FLT *kx, FLT *ky, FLT *kz, std::vector<BIGINT> counts(nbins,0); // count how many pts in each bin for (BIGINT i=0; i
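The body of fold_rescale itself is not shown in this patch excerpt. Based on foldRescale03 in devel/foldrescale.cpp and the declaration added to spreadinterp.cpp, it presumably looks close to the following sketch; this is a reconstruction for the reader, not the literal committed code:

    static FINUFFT_ALWAYS_INLINE FLT fold_rescale(FLT x, BIGINT N, bool p) noexcept {
      // p=true: fold x by one 2*pi period; p=false: fold x by one period of length N.
      // Either way, the fractional part is rescaled to [0,N).
      const FLT fN = FLT(N);
      FLT r;
      if (p) {
        static constexpr FLT x2pi = FLT(M_1_2PI);
        r = x * x2pi + FLT(0.5);
      } else {
        r = x / fN;
      }
      r -= floor(r);
      return r * fN;
    }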