From 5aa27053e5cc74f2c2d8346055d54bbf16b57317 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Wed, 8 May 2024 16:24:18 -0400 Subject: [PATCH] using the new foldrescale --- CMakeLists.txt | 13 +- CMakePresets.json | 19 ++- devel/CMakeLists.txt | 21 +++ devel/foldrescale.cpp | 294 ++++++++++++++++++++++++++++++++++ include/finufft/defs.h | 10 +- include/finufft_errors.h | 3 +- include/finufft_opts.h | 24 +++ include/finufft_spread_opts.h | 2 +- perftest/spreadtestnd.cpp | 2 - src/finufft.cpp | 41 +++-- src/spreadinterp.cpp | 125 ++++++--------- 11 files changed, 447 insertions(+), 107 deletions(-) create mode 100644 devel/CMakeLists.txt create mode 100644 devel/foldrescale.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 3e9a4d3f5..0d7c7130b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,6 +31,7 @@ option(FINUFFT_USE_OPENMP "Whether to use OpenMP for parallelization. If disable option(FINUFFT_USE_CUDA "Whether to build CUDA accelerated FINUFFT library (libcufinufft). This is completely independent of the main FINUFFT library" OFF) option(FINUFFT_USE_CPU "Whether to build the ordinary FINUFFT library (libfinufft)." ON) option(FINUFFT_STATIC_LINKING "Whether to link the static FINUFFT library (libfinufft_static)." ON) +option(FINUFTT_BUILD_DEVEL "Whether to build developement executables" OFF) # sphinx tag (don't remove): @cmake_opts_end if(FINUFFT_USE_CPU) @@ -45,10 +46,11 @@ if(FINUFFT_USE_CPU) endif() set(CPM_DOWNLOAD_VERSION 0.38.0) - include(cmake/setupCPM.cmake) - set(FFTW_VERSION 3.3.10) + + include(cmake/setupCPM.cmake) include(cmake/setupFFTW.cmake) + endif() if (FINUFFT_BUILD_MATLAB) @@ -106,11 +108,11 @@ function(finufft_link_test target) endif() endif() else() - target_link_libraries(${target} PRIVATE finufft) if(WIN32) target_compile_definitions(${target} PRIVATE FINUFFT_DLL) endif() endif() + target_link_libraries(${target} PUBLIC finufft) enable_asan(${target}) endfunction() @@ -165,7 +167,6 @@ if(FINUFFT_USE_CPU) target_compile_definitions(finufft_f64 PRIVATE) set_finufft_options(finufft_f64) target_link_libraries(finufft_f64 PUBLIC ${FINUFFT_FFTW_LIBRARIES}) - if(WIN32) add_library(finufft_f32_dll OBJECT ${FINUFFT_PRECISION_DEPENDENT_SOURCES}) target_compile_definitions(finufft_f32_dll PRIVATE SINGLE dll_EXPORTS FINUFFT_DLL) @@ -246,6 +247,10 @@ if (FINUFFT_BUILD_MATLAB) add_subdirectory(matlab) endif () +if (FINUFTT_BUILD_DEVEL) + add_subdirectory(devel) +endif () + include(GNUInstallDirs) install(TARGETS ${INSTALL_TARGETS} PUBLIC_HEADER) install(FILES ${PROJECT_SOURCE_DIR}/LICENSE diff --git a/CMakePresets.json b/CMakePresets.json index 117582714..5276f5018 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -31,7 +31,22 @@ "generator": "Ninja Multi-Config", "cacheVariables": { "FINUFFT_BUILD_TESTS": "ON", - "FINUFFT_BUILD_EXAMPLES": "ON" + "FINUFFT_BUILD_EXAMPLES": "ON", + "FINUFTT_BUILD_DEVEL": "ON" + } + }, + { + "name": "benchmark", + "binaryDir": "build/benchmark", + "displayName": "Benchmark", + "description": "Benchmark release configuration (ninja)", + "generator": "Ninja", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "RelWithDebInfo", + "FINUFFT_BUILD_TESTS": "ON", + "FINUFFT_BUILD_EXAMPLES": "ON", + "FINUFFT_FFTW_SUFFIX": "", + "FINUFFT_USE_OPENMP": "OFF" } }, { @@ -104,7 +119,7 @@ { "name": "dev", "configurePreset": "dev", - "configuration": "Debug" + "configuration": "RelWithDebInfo" }, { "name": "ninja-multi", diff --git a/devel/CMakeLists.txt b/devel/CMakeLists.txt new file mode 100644 index 000000000..01b8e48dd --- /dev/null +++ 
b/devel/CMakeLists.txt @@ -0,0 +1,21 @@ +project(finufft_devel) +# Set the minimum required version of CMake +cmake_minimum_required(VERSION 3.5) + + +# include cpm cmake, downloading it +CPMAddPackage( + NAME benchmark + GITHUB_REPOSITORY google/benchmark + VERSION 1.8.3 + OPTIONS "BENCHMARK_ENABLE_TESTING OFF" + +) + +if (benchmark_ADDED) + # patch benchmark target + set_target_properties(benchmark PROPERTIES CXX_STANDARD 17) +endif() + +add_executable(foldrescale foldrescale.cpp) +target_link_libraries(foldrescale finufft benchmark) \ No newline at end of file diff --git a/devel/foldrescale.cpp b/devel/foldrescale.cpp new file mode 100644 index 000000000..e038be252 --- /dev/null +++ b/devel/foldrescale.cpp @@ -0,0 +1,294 @@ +#include "finufft/defs.h" +#include +#include +#include +#include +// no vectorize +#pragma GCC optimize("no-tree-vectorize") +/* local NU coord fold+rescale macro: does the following affine transform to x: + when p=true: map [-3pi,-pi) and [-pi,pi) and [pi,3pi) each to [0,N) + otherwise, map [-N,0) and [0,N) and [N,2N) each to [0,N) + Thus, only one period either side of the principal domain is folded. + (It is *so* much faster than slow std::fmod that we stick to it.) + This explains FINUFFT's allowed input domain of [-3pi,3pi). + Speed comparisons of this macro vs a function are in devel/foldrescale*. + The macro wins hands-down on i7, even for modern GCC9. + This should be done in C++ not as a macro, someday. +*/ +#define FOLDRESCALE(x, N, p) (p ? \ + (x + (x>=-PI ? (x=0.0 ? (x<(FLT)N ? x : x-(FLT)N) : x+(FLT)N)) + + +#define FOLDRESCALE04(x, N, p) (p ? \ + ((x * FLT(M_1_2PI) + FLT(0.5)) - floor(x * FLT(M_1_2PI) + FLT(0.5))) * FLT(N) : \ + ((x/FLT(N))-floor(x/FLT(N)))*FLT(N)) + +#define FOLDRESCALE05(x, N, p) FLT(N) * (p ? \ + ((x * FLT(M_1_2PI) + FLT(0.5)) - floor(x * FLT(M_1_2PI) + FLT(0.5))) : \ + ((x/FLT(N))-floor(x/FLT(N)))) + +inline __attribute__((always_inline)) +FLT foldRescale00(FLT x, BIGINT N, bool p) { + FLT result; + FLT fN = FLT(N); + if (p) { + static constexpr FLT x2pi = FLT(M_1_2PI); + result = x * x2pi + FLT(0.5); + result -= floor(result); + } else { + const FLT invN = FLT(1.0) / fN; + result = x * invN; + result -= floor(result); + } + return result * fN; +} + +inline __attribute__((always_inline)) +FLT foldRescale01(FLT x, BIGINT N, bool p) { + return p ? (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT) M_1_2PI * N) : + (x >= 0.0 ? (x < (FLT) N ? x : x - (FLT) N) : x + (FLT) N); +} + +template +inline __attribute__((always_inline)) +FLT foldRescale02(FLT x, BIGINT N) { + if constexpr (p) { + return (x + (x >= -PI ? (x < PI ? PI : -PI) : 3 * PI)) * ((FLT) M_1_2PI * N); + } else { + return (x >= 0.0 ? (x < (FLT) N ? 
x : x - (FLT) N) : x + (FLT) N); + } +} + +template +inline __attribute__((always_inline)) +FLT foldRescale03(FLT x, BIGINT N) { + FLT result; + FLT fN = FLT(N); + if constexpr (p) { + static constexpr FLT x2pi = FLT(M_1_2PI); + result = x * x2pi + FLT(0.5); + result -= floor(result); + } else { + const FLT invN = FLT(1.0) / fN; + result = x * invN; + result -= floor(result); + } + return result * fN; +} + +static std::mt19937_64 gen; +static std::uniform_real_distribution<> dis(-3 * M_PI, 3 * M_PI); +static const auto N = std::uniform_int_distribution<>{0, 1000}(gen); +static std::uniform_real_distribution<> disN(-N, 2*N); +static volatile auto pirange = true; +static volatile auto notPirange = !pirange; + +static void BM_BASELINE(benchmark::State &state) { + for (auto _: state) { + benchmark::DoNotOptimize(dis(gen)); + } +} + +static void BM_FoldRescaleMacro(benchmark::State &state) { + for (auto _: state) { + FLT x = dis(gen); + benchmark::DoNotOptimize(FOLDRESCALE(x, N, pirange)); + } +} + +static void BM_FoldRescaleMacroN(benchmark::State &state) { + for (auto _: state) { + FLT x = disN(gen); + benchmark::DoNotOptimize(FOLDRESCALE(x, N, notPirange)); + } +} + +static void BM_FoldRescale00(benchmark::State &state) { + for (auto _: state) { + FLT x = dis(gen); + benchmark::DoNotOptimize(foldRescale00(x, N, pirange)); + } +} + + +static void BM_FoldRescale00N(benchmark::State &state) { + for (auto _: state) { + FLT x = disN(gen); + benchmark::DoNotOptimize(foldRescale00(x, N, notPirange)); + } +} + + +static void BM_FoldRescale01(benchmark::State &state) { + for (auto _: state) { + FLT x = dis(gen); + benchmark::DoNotOptimize(foldRescale01(x, N, pirange)); + } +} + + +static void BM_FoldRescale01N(benchmark::State &state) { + for (auto _: state) { + FLT x = disN(gen); + benchmark::DoNotOptimize(foldRescale01(x, N, notPirange)); + } +} + +static void BM_FoldRescale02(benchmark::State &state) { + for (auto _: state) { + FLT x = dis(gen); + benchmark::DoNotOptimize(foldRescale02(x, N)); + } +} + + +static void BM_FoldRescale02N(benchmark::State &state) { + for (auto _: state) { + FLT x = disN(gen); + benchmark::DoNotOptimize(foldRescale02(x, N)); + } +} + + +static void BM_FoldRescale03(benchmark::State &state) { + for (auto _: state) { + FLT x = dis(gen); + benchmark::DoNotOptimize(foldRescale03(x, N)); + } +} + +static void BM_FoldRescale03N(benchmark::State &state) { + for (auto _: state) { + FLT x = disN(gen); + benchmark::DoNotOptimize(foldRescale03(x, N)); + } +} + +static void BM_FoldRescale04(benchmark::State &state) { + for (auto _: state) { + FLT x = dis(gen); + benchmark::DoNotOptimize(FOLDRESCALE04(x, N, pirange)); + } +} + +static void BM_FoldRescale04N(benchmark::State &state) { + for (auto _: state) { + FLT x = disN(gen); + benchmark::DoNotOptimize(FOLDRESCALE04(x, N, notPirange)); + } +} + +static void BM_FoldRescale05(benchmark::State &state) { + for (auto _: state) { + FLT x = dis(gen); + benchmark::DoNotOptimize(FOLDRESCALE05(x, N, pirange)); + } +} + +static void BM_FoldRescale05N(benchmark::State &state) { + for (auto _: state) { + FLT x = disN(gen); + benchmark::DoNotOptimize(FOLDRESCALE05(x, N, notPirange)); + } +} + + +BENCHMARK(BM_BASELINE); + +BENCHMARK(BM_FoldRescaleMacro); +BENCHMARK(BM_FoldRescale00); +BENCHMARK(BM_FoldRescale01); +BENCHMARK(BM_FoldRescale02); +BENCHMARK(BM_FoldRescale03); +BENCHMARK(BM_FoldRescale04); +BENCHMARK(BM_FoldRescale05); + +BENCHMARK(BM_FoldRescaleMacroN); +BENCHMARK(BM_FoldRescale00N); +BENCHMARK(BM_FoldRescale01N); 
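// For reference, every variant benchmarked above computes the same mapping:
//   p=true : x -> N * frac(x/(2*pi) + 0.5)   (so [-pi,pi) maps onto [0,N))
//   p=false: x -> N * frac(x/N)              (so [0,N) maps onto itself)
// A compact equivalent with worked values; the helper name foldRescaleRef is
// illustrative only and not part of FINUFFT:
static inline FLT foldRescaleRef(FLT x, BIGINT N, bool p) {
  const FLT r = p ? x * FLT(M_1_2PI) + FLT(0.5) : x / FLT(N); // shift into units of one period
  return (r - floor(r)) * FLT(N);                             // frac(r), rescaled to [0,N)
}
// e.g. N=100: x=-pi/2 -> 25 and x=5*pi/2 -> 75 (p=true); x=-10 -> 90 (p=false).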
+BENCHMARK(BM_FoldRescale02N); +BENCHMARK(BM_FoldRescale03N); +BENCHMARK(BM_FoldRescale04N); +BENCHMARK(BM_FoldRescale05N); + +void testFoldRescaleFunctions() { + for (bool p: {true, false}) { + for (int i = 0; i < 1024; ++i) { // Run the test 1000 times + FLT x = dis(gen); + FLT resultMacro = FOLDRESCALE(x, N, p); + FLT result00 = foldRescale00(x, N, p); + FLT result01 = foldRescale01(x, N, p); + FLT result02 = p ? foldRescale02(x, N) : foldRescale02(x, N); + FLT result03 = p ? foldRescale03(x, N) : foldRescale03(x, N); + FLT result04 = FOLDRESCALE04(x, N, p); + FLT result05 = FOLDRESCALE05(x, N, p); + // function that compares two floating point number with a tolerance, using relative error + auto compare = [](FLT a, FLT b) { + return std::abs(a - b) > std::max(std::abs(a), std::abs(b)) * 10e-13; + }; + + if (compare(resultMacro, result00)) { + std::cout << "resultMacro: " << resultMacro << " result00: " << result00 << std::endl; + throw std::runtime_error("function00 is wrong"); + } + if (compare(resultMacro, result01)) { + std::cout << "resultMacro: " << resultMacro << " result01: " << result01 << std::endl; + throw std::runtime_error("function01 is wrong"); + } + if (compare(resultMacro, result02)) { + std::cout << "resultMacro: " << resultMacro << " result02: " << result02 << std::endl; + throw std::runtime_error("function02 is wrong"); + } + if (compare(resultMacro, result03)) { + std::cout << "resultMacro: " << resultMacro << " result03: " << result03 << std::endl; + throw std::runtime_error("function03 is wrong"); + } + if (compare(resultMacro, result04)) { + std::cout << "resultMacro: " << resultMacro << " result04: " << result04 << std::endl; + throw std::runtime_error("function04 is wrong"); + } + if (compare(resultMacro, result05)) { + std::cout << "resultMacro: " << resultMacro << " result05: " << result05 << std::endl; + throw std::runtime_error("function05 is wrong"); + } + } + } +} + +class BaselineSubtractingReporter : public benchmark::ConsoleReporter { +public: + bool ReportContext(const Context &context) override { + return benchmark::ConsoleReporter::ReportContext(context); + } + + void ReportRuns(const std::vector &reports) override { + for (const auto &run: reports) { + if (run.benchmark_name() == "BM_BASELINE") { + baseline_time = run.cpu_accumulated_time; + } else { + Run modified_run = run; + modified_run.cpu_accumulated_time -= baseline_time; + benchmark::ConsoleReporter::ReportRuns({modified_run}); + } + } + } + +private: + double baseline_time = 0.0; +}; + +int main(int argc, char **argv) { + pirange = argc & 1; + notPirange = !pirange; + static std::random_device rd; + const auto seed = rd(); + std::cout << "Seed: " << seed << "\n"; +// gen.seed(226307105); + gen.seed(seed); + testFoldRescaleFunctions(); + ::benchmark::Initialize(&argc, argv); + BaselineSubtractingReporter reporter; + ::benchmark::RunSpecifiedBenchmarks(&reporter); + return 0; +} diff --git a/include/finufft/defs.h b/include/finufft/defs.h index 3af79bed6..8780ff914 100644 --- a/include/finufft/defs.h +++ b/include/finufft/defs.h @@ -37,7 +37,15 @@ #include // we define C++ complex type only #define CPX std::complex - +// inline macro, to force inlining of small functions +// this avoids the use of macros to implement functions +#if defined(_MSC_VER) +#define FINUFFT_ALWAYS_INLINE __forceinline +#elif defined(__GNUC__) || defined(__clang__) +#define FINUFFT_ALWAYS_INLINE __attribute__((always_inline)) inline +#else +#define FINUFFT_ALWAYS_INLINE inline +#endif // ------------- Library-wide 
algorithm parameter settings ---------------- diff --git a/include/finufft_errors.h b/include/finufft_errors.h index 17305ebf6..b2555d522 100644 --- a/include/finufft_errors.h +++ b/include/finufft_errors.h @@ -6,7 +6,7 @@ #define FINUFFT_WARN_EPS_TOO_SMALL 1 #define FINUFFT_ERR_MAXNALLOC 2 #define FINUFFT_ERR_SPREAD_BOX_SMALL 3 -#define FINUFFT_ERR_SPREAD_PTS_OUT_RANGE 4 +#define FINUFFT_ERR_SPREAD_PTS_OUT_RANGE 4 // DEPRECATED #define FINUFFT_ERR_SPREAD_ALLOC 5 #define FINUFFT_ERR_SPREAD_DIR 6 #define FINUFFT_ERR_UPSAMPFAC_TOO_SMALL 7 @@ -23,4 +23,5 @@ #define FINUFFT_ERR_BINSIZE_NOTVALID 18 #define FINUFFT_ERR_INSUFFICIENT_SHMEM 19 #define FINUFFT_ERR_NUM_NU_PTS_INVALID 20 +#define FINUFFT_WARN_CHKBND_NOT_DEFAULT 21 #endif diff --git a/include/finufft_opts.h b/include/finufft_opts.h index 3a0156000..bb93b8457 100644 --- a/include/finufft_opts.h +++ b/include/finufft_opts.h @@ -5,6 +5,30 @@ #ifndef FINUFFT_OPTS_H #define FINUFFT_OPTS_H +// Marco Barbone: 5.8.2024 +// These are user-facing to that the user can reset to the default value +// TODO: the various options could be macros so that the user does not need to +// look up documentation to change to another value +// Question: would these be be enums? + +#define FINUFFT_MODEORD_DEFAULT (0) +#define FINUFFT_CHKBND_DEFAULT (1) + +#define FINUFFT_DEBUG_DEFAULT (0) +#define FINUFFT_SPREAD_DEBUG_DEFAULT (0) +#define FINUFFT_SHOWWARN_DEFAULT (1) + +#define FINUFFT_NTHREADS_DEFAULT (0) +#define FINUFFT_FFTW_DEFAULT (FFTW_ESTIMATE) +#define FINUFFT_SPREAD_SORT_DEFAULT (2) +#define FINUFFT_SPREAD_KEREVALMETH_DEFAULT (1) +#define FINUFFT_SPREAD_KERPAD_DEFAULT (1) +#define FINUFFT_UPSAMPFAC_DEFAULT (0.0) +#define FINUFFT_SPREAD_THREAD_DEFAULT (0) +#define FINUFFT_MAXBATCHSIZE_DEFAULT (0) +#define FINUFFT_SPREAD_NTHR_ATOMIC_DEFAULT (-1) +#define FINUFFT_SPREAD_MAX_SP_SIZE_DEFAULT (0) + typedef struct finufft_opts{ // defaults see finufft.cpp:finufft_default_opts() // sphinx tag (don't remove): @opts_start // FINUFFT options: diff --git a/include/finufft_spread_opts.h b/include/finufft_spread_opts.h index 2b6f4fbcf..5246bf75d 100644 --- a/include/finufft_spread_opts.h +++ b/include/finufft_spread_opts.h @@ -13,7 +13,7 @@ typedef struct finufft_spread_opts { int nspread; // w, the kernel width in grid pts int spread_direction; // 1 means spread NU->U, 2 means interpolate U->NU int pirange; // 0: NU periodic domain is [0,N), 1: domain [-pi,pi) - int chkbnds; // 0: don't check NU pts in 3-period range; 1: do + int chkbnds; // 0: don't check NU pts in 3-period range; 1: do [DEPRECATED] int sort; // 0: don't sort NU pts, 1: do, 2: heuristic choice int kerevalmeth; // 0: direct exp(sqrt()), or 1: Horner ppval, fastest int kerpad; // 0: no pad w to mult of 4, 1: do pad diff --git a/perftest/spreadtestnd.cpp b/perftest/spreadtestnd.cpp index b8c05b1aa..44500fb47 100644 --- a/perftest/spreadtestnd.cpp +++ b/perftest/spreadtestnd.cpp @@ -111,8 +111,6 @@ int main(int argc, char* argv[]) return ier_set; } opts.pirange = 0; // crucial, since below has NU pts on [0,Nd] in each dim - opts.chkbnds = 0; // only for debug, since below code has correct bounds); - // however, 1 can make a >5% slowdown for low-tol 3D. opts.debug = debug; // print more diagnostics? 
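/* Usage sketch for the FINUFFT_*_DEFAULT macros introduced in finufft_opts.h
   above: a caller can restore a single field to its documented default without
   looking the value up. The double-precision API name finufft_default_opts is
   assumed here and is not part of this patch:
     finufft_opts opts;
     finufft_default_opts(&opts);
     opts.spread_sort = 0;                           // experiment: no sorting
     opts.spread_sort = FINUFFT_SPREAD_SORT_DEFAULT; // restore the default (2)
*/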
opts.sort = sort; opts.flags = flags;
diff --git a/src/finufft.cpp b/src/finufft.cpp index a05e1c02e..db3fe71b1 100644 --- a/src/finufft.cpp +++ b/src/finufft.cpp @@ -136,6 +136,13 @@ int setup_spreader_for_nufft(finufft_spread_opts &spopts, FLT eps, finufft_opts spopts.atomic_threshold = opts.spread_nthr_atomic; if (opts.spread_max_sp_size>0) // overrides spopts.max_subproblem_size = opts.spread_max_sp_size; + if (opts.chkbnds != FINUFFT_CHKBND_DEFAULT) { + fprintf(stderr, "the chkbnds option is deprecated; please use the default value\n"); + // if another error occurred before this, return that error; otherwise return the warning + if (!ier) { + return FINUFFT_WARN_CHKBND_NOT_DEFAULT; + } + } return ier; } @@ -529,23 +536,23 @@ void FINUFFT_DEFAULT_OPTS(finufft_opts *o) // Sphinx sucks the below code block into the web docs, hence keep it clean... { // sphinx tag (don't remove): @defopts_start - o->modeord = 0; - o->chkbnds = 1; - - o->debug = 0; - o->spread_debug = 0; - o->showwarn = 1; - - o->nthreads = 0; - o->fftw = FFTW_ESTIMATE; - o->spread_sort = 2; - o->spread_kerevalmeth = 1; - o->spread_kerpad = 1; - o->upsampfac = 0.0; - o->spread_thread = 0; - o->maxbatchsize = 0; - o->spread_nthr_atomic = -1; - o->spread_max_sp_size = 0; + o->modeord = FINUFFT_MODEORD_DEFAULT; + o->chkbnds = FINUFFT_CHKBND_DEFAULT; + + o->debug = FINUFFT_DEBUG_DEFAULT; + o->spread_debug = FINUFFT_SPREAD_DEBUG_DEFAULT; + o->showwarn = FINUFFT_SHOWWARN_DEFAULT; + + o->nthreads = FINUFFT_NTHREADS_DEFAULT; + o->fftw = FINUFFT_FFTW_DEFAULT; + o->spread_sort = FINUFFT_SPREAD_SORT_DEFAULT; + o->spread_kerevalmeth = FINUFFT_SPREAD_KEREVALMETH_DEFAULT; + o->spread_kerpad = FINUFFT_SPREAD_KERPAD_DEFAULT; + o->upsampfac = FINUFFT_UPSAMPFAC_DEFAULT; + o->spread_thread = FINUFFT_SPREAD_THREAD_DEFAULT; + o->maxbatchsize = FINUFFT_MAXBATCHSIZE_DEFAULT; + o->spread_nthr_atomic = FINUFFT_SPREAD_NTHR_ATOMIC_DEFAULT; + o->spread_max_sp_size = FINUFFT_SPREAD_MAX_SP_SIZE_DEFAULT; // sphinx tag (don't remove): @defopts_end }
diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 8d6e34d96..7768ae9e2 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -13,30 +13,22 @@ /* local NU coord fold+rescale macro: does the following affine transform to x: - when p=true: map [-3pi,-pi) and [-pi,pi) and [pi,3pi) each to [0,N) - otherwise, map [-N,0) and [0,N) and [N,2N) each to [0,N) - Thus, only one period either side of the principal domain is folded. - (It is *so* much faster than slow std::fmod that we stick to it.) - This explains FINUFFT's allowed input domain of [-3pi,3pi). - Speed comparisons of this macro vs a function are in devel/foldrescale*. - The macro wins hands-down on i7, even for modern GCC9. - This should be done in C++ not as a macro, someday. + when p=true: x is folded mod 2pi and rescaled to [0,N) + otherwise, x is folded mod N and rescaled to [0,N) + Note: folding very large inputs can introduce numerical inaccuracies */ -#define FOLDRESCALE(x,N,p) (p ? \ (x + (x>=-PI ? (x<PI ? PI : -PI) : 3*PI)) * ((FLT)M_1_2PI*N) : \ (x>=0.0 ? (x<(FLT)N ? x : x-(FLT)N) : x+(FLT)N)) - using namespace std; using namespace finufft::utils; // access to timer namespace finufft { namespace spreadinterp { - + // declarations of purely internal functions...
(thus need not be in .h) -static inline void set_kernel_args(FLT *args, FLT x, const finufft_spread_opts& opts); -static inline void evaluate_kernel_vector(FLT *ker, FLT *args, const finufft_spread_opts& opts, const int N); -static inline void eval_kernel_vec_Horner(FLT *ker, const FLT z, const int w, const finufft_spread_opts &opts); +static FINUFFT_ALWAYS_INLINE FLT fold_rescale(FLT x, BIGINT N, bool p) noexcept; +static FINUFFT_ALWAYS_INLINE void set_kernel_args(FLT *args, FLT x, const finufft_spread_opts& opts); +static FINUFFT_ALWAYS_INLINE void evaluate_kernel_vector(FLT *ker, FLT *args, const finufft_spread_opts& opts, const int N); +static FINUFFT_ALWAYS_INLINE void eval_kernel_vec_Horner(FLT *ker, FLT z, const int w, const finufft_spread_opts &opts); void interp_line(FLT *out,FLT *du, FLT *ker,BIGINT i1,BIGINT N1,int ns); void interp_square(FLT *out,FLT *du, FLT *ker1, FLT *ker2, BIGINT i1,BIGINT i2,BIGINT N1,BIGINT N2,int ns); void interp_cube(FLT *out,FLT *du, FLT *ker1, FLT *ker2, FLT *ker3, @@ -68,7 +60,6 @@ void get_subgrid(BIGINT &offset1,BIGINT &offset2,BIGINT &offset3,BIGINT &size1, FLT* kz0,int ns, int ndims); - // ========================================================================== int spreadinterp( @@ -119,10 +110,9 @@ int spreadinterp( 1D, only kx and ky read in 2D). These should lie in the box 0<=kx<=N1 etc (if pirange=0), - or -pi<=kx<=pi (if pirange=1). However, points up to +-1 period + or -pi<=kx<=pi (if pirange=1). However, points outside this domain are also correctly folded back into this - domain, but pts beyond this either raise an error (if chkbnds=1) - or a crash (if chkbnds=0). + domain opts - spread/interp options struct, documented in ../include/finufft_spread_opts.h Inputs/Outputs: @@ -133,8 +123,6 @@ int spreadinterp( 0 indicates success; other values have meanings in ../docs/error.rst, with following modifications: 3 : one or more non-trivial box dimensions is less than 2.nspread. - 4 : nonuniform points outside [-Nm,2*Nm] or [-3pi,3pi] in at least one - dimension m=1,2,3. 5 : failed allocate sort indices Magland Dec 2016. Barnett openmp version, many speedups 1/16/17-2/16/17 @@ -178,11 +166,9 @@ int spreadcheck(BIGINT N1, BIGINT N2, BIGINT N3, BIGINT M, FLT *kx, FLT *ky, /* This does just the input checking and reporting for the spreader. See spreadinterp() for input arguments and meaning of returned value. Split out by Melody Shih, Jun 2018. Finiteness chk Barnett 7/30/18. - Bypass FOLDRESCALE macro which has inevitable rounding err even nr +pi, - giving fake invalids well inside the [-3pi,3pi] domain, 4/9/21. + Marco Barbone 5.8.24 removed bounds check as new foldrescale is not limited to [-3pi,3pi) */ { - CNTime timer; // INPUT CHECKING & REPORTING .... cuboid not too small for spreading? 
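  /* Why the old NU-point range check could be dropped (deleted block below):
     the new fold_rescale is periodic for any finite input, not just the central
     three periods. For example, with pirange=1 and N1=64,
         kx = 100*pi  ->  64 * frac(100*pi/(2*pi) + 0.5) = 64 * 0.5 = 32,
     whereas the old FOLDRESCALE macro only handled [-3pi,3pi) and such a point
     was rejected with FINUFFT_ERR_SPREAD_PTS_OUT_RANGE (now marked DEPRECATED). */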
int minN = 2*opts.nspread; if (N11 && N21 && N33.0*PI) : (kx[i]<-N1 || kx[i]>2*N1)) || !isfinite(kx[i])) { - fprintf(stderr,"%s NU pt not in valid range (central three periods): kx[%lld]=%.16g, N1=%lld (pirange=%d)\n",__func__, (long long)i, kx[i], (long long)N1,opts.pirange); - return FINUFFT_ERR_SPREAD_PTS_OUT_RANGE; - } - } - if (ndims>1) - for (BIGINT i=0; i3.0*PI) : (ky[i]<-N2 || ky[i]>2*N2)) || !isfinite(ky[i])) { - fprintf(stderr,"%s NU pt not in valid range (central three periods): ky[%lld]=%.16g, N2=%lld (pirange=%d)\n",__func__, (long long)i, ky[i], (long long)N2,opts.pirange); - return FINUFFT_ERR_SPREAD_PTS_OUT_RANGE; - } - } - if (ndims>2) - for (BIGINT i=0; i3.0*PI) : (kz[i]<-N3 || kz[i]>2*N3)) || !isfinite(kz[i])) { - fprintf(stderr,"%s NU pt not in valid range (central three periods): kz[%lld]=%.16g, N3=%lld (pirange=%d)\n",__func__, (long long)i, kz[i], (long long)N3,opts.pirange); - return FINUFFT_ERR_SPREAD_PTS_OUT_RANGE; - } - } - if (opts.debug) printf("\tNU bnds check:\t\t%.3g s\n",timer.elapsedsec()); - } - return 0; + return 0; } @@ -237,11 +194,8 @@ int indexSort(BIGINT* sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, BIGINT M, Inputs: M - number of input NU points. - kx,ky,kz - length-M arrays of real coords of NU pts, in the domain - for FOLDRESCALE, which includes [0,N1], [0,N2], [0,N3] - respectively, if opts.pirange=0; or [-pi,pi] if opts.pirange=1. + kx,ky,kz - length-M arrays of real coords of NU pts. (only kz used in 1D, only kx and ky used in 2D.) - These must have been bounds-checked already; see spreadcheck. N1,N2,N3 - integer sizes of overall box (set N2=N3=1 for 1D, N3=1 for 2D). 1 = x (fastest), 2 = y (medium), 3 = z (slowest). opts - spreading options struct, see ../include/finufft_spread_opts.h @@ -381,9 +335,9 @@ int spreadSorted(BIGINT* sort_indices,BIGINT N1, BIGINT N2, BIGINT N3, FLT *dd0=(FLT*)malloc(sizeof(FLT)*M0*2); // complex strength data for (BIGINT j=0; j1) ky0[j]=FOLDRESCALE(ky[kk],N2,opts.pirange); - if (N3>1) kz0[j]=FOLDRESCALE(kz[kk],N3,opts.pirange); + kx0[j]= fold_rescale(kx[kk], N1, opts.pirange); + if (N2>1) ky0[j]= fold_rescale(ky[kk], N2, opts.pirange); + if (N3>1) kz0[j]= fold_rescale(kz[kk], N3, opts.pirange); dd0[j*2]=data_nonuniform[kk*2]; // real part dd0[j*2+1]=data_nonuniform[kk*2+1]; // imag part } @@ -474,11 +428,11 @@ int interpSorted(BIGINT* sort_indices,BIGINT N1, BIGINT N2, BIGINT N3, for (int ibuf=0; ibuf=2) - yjlist[ibuf] = FOLDRESCALE(ky[j],N2,opts.pirange); + yjlist[ibuf] = fold_rescale(ky[j], N2, opts.pirange); if(ndims == 3) - zjlist[ibuf] = FOLDRESCALE(kz[j],N3,opts.pirange); + zjlist[ibuf] = fold_rescale(kz[j], N3, opts.pirange); } // Loop over targets in chunk @@ -583,7 +537,6 @@ int setup_spreader(finufft_spread_opts &opts, FLT eps, double upsampfac, // write out default finufft_spread_opts (some overridden in setup_spreader_for_nufft) opts.spread_direction = 0; // user should always set to 1 or 2 as desired opts.pirange = 1; // user also should always set this - opts.chkbnds = 0; opts.sort = 2; // 2:auto-choice opts.kerpad = 0; // affects only evaluate_kernel_vector opts.kerevalmeth = kerevalmeth; @@ -1218,7 +1171,7 @@ void bin_sort_singlethread(BIGINT *ret, BIGINT M, FLT *kx, FLT *ky, FLT *kz, * * Inputs: M - number of input NU points. * kx,ky,kz - length-M arrays of real coords of NU pts, in the domain - * for FOLDRESCALE, which includes [0,N1], [0,N2], [0,N3] + * for fold_rescale, which includes [0,N1], [0,N2], [0,N3] * respectively, if pirange=0; or [-pi,pi] if pirange=1. 
 * N1,N2,N3 - integer sizes of overall box (N2=N3=1 for 1D, N3=1 for 2D) * bin_size_x,y,z - what binning box size to use in each dimension @@ -1248,9 +1201,9 @@ void bin_sort_singlethread(BIGINT *ret, BIGINT M, FLT *kx, FLT *ky, FLT *kz, std::vector<BIGINT> counts(nbins,0); // count how many pts in each bin for (BIGINT i=0; i
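The body of fold_rescale itself is not shown in this patch excerpt. Based on foldRescale03 in devel/foldrescale.cpp and the declaration added to spreadinterp.cpp, it presumably looks close to the following sketch; this is a reconstruction for the reader, not the literal committed code:

    static FINUFFT_ALWAYS_INLINE FLT fold_rescale(FLT x, BIGINT N, bool p) noexcept {
      // p=true: fold x by one 2*pi period; p=false: fold x by one period of length N.
      // Either way, the fractional part is rescaled to [0,N).
      const FLT fN = FLT(N);
      FLT r;
      if (p) {
        static constexpr FLT x2pi = FLT(M_1_2PI);
        r = x * x2pi + FLT(0.5);
      } else {
        r = x / fN;
      }
      r -= floor(r);
      return r * fN;
    }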